Diffstat (limited to 'proposals')
-rw-r--r-- | proposals/2019_ingest.md | 287
-rw-r--r-- | proposals/2019_pdftotext_pdfinfo.md | 123
-rw-r--r-- | proposals/20200129_pdf_ingest.md | 272
-rw-r--r-- | proposals/20200207_pdftrio.md | 104
-rw-r--r-- | proposals/20200211_nsq.md | 79
-rw-r--r-- | proposals/20201012_no_capture.md | 36
-rw-r--r-- | proposals/20201026_html_ingest.md | 129
-rw-r--r-- | proposals/20201103_xml_ingest.md | 81
-rw-r--r-- | proposals/2020_pdf_meta_thumbnails.md | 328
-rw-r--r-- | proposals/2020_seaweed_s3.md | 426
-rw-r--r-- | proposals/2021-04-22_crossref_db.md | 86
-rw-r--r-- | proposals/2021-09-09_component_ingest.md | 114
-rw-r--r-- | proposals/2021-09-13_src_ingest.md | 53
-rw-r--r-- | proposals/schema_changes.sql | 40
14 files changed, 2158 insertions, 0 deletions
diff --git a/proposals/2019_ingest.md b/proposals/2019_ingest.md new file mode 100644 index 0000000..c649809 --- /dev/null +++ b/proposals/2019_ingest.md @@ -0,0 +1,287 @@ + +status: work-in-progress + +This document proposes structure and systems for ingesting (crawling) paper +PDFs and other content as part of sandcrawler. + +## Overview + +The main abstraction is a sandcrawler "ingest request" object, which can be +created and submitted to one of several systems for automatic harvesting, +resulting in an "ingest result" metadata object. This result should contain +enough metadata to be automatically imported into fatcat as a file/release +mapping. + +The structure and pipelines should be flexible enough to work with individual +PDF files, web captures, and datasets. It should work for on-demand +(interactive) ingest (for "save paper now" features), soft-real-time +(hourly/daily/queued), batches of hundreds or thousands of requests, and scale +up to batch ingest crawls of tens of millions of URLs. Most code should not +care about how or when content is actually crawled. + +The motivation for this structure is to consolidate and automate the current ad +hoc systems for crawling, matching, and importing into fatcat. It is likely +that there will still be a few special cases with their own importers, but the +goal is that in almost all cases that we discover a new structured source of +content to ingest (eg, a new manifest of identifiers to URLs), we can quickly +transform the task into a list of ingest requests, then submit those requests +to an automated system to have them archived and inserted into fatcat with as +little manual effort as possible. + +## Use Cases and Workflows + +### Unpaywall Example + +As a motivating example, consider how unpaywall crawls are done today: + +- download and archive JSON dump from unpaywall. transform and filter into a + TSV with DOI, URL, release-stage columns. +- filter out previously crawled URLs from this seed file, based on last dump, + with the intent of not repeating crawls unnecessarily +- run heritrix3 crawl, usually by sharding seedlist over multiple machines. + after crawl completes: + - backfill CDX PDF subset into hbase (for future de-dupe) + - generate CRL files etc and upload to archive items +- run arabesque over complete crawl logs. this takes time, is somewhat manual, + and has scaling issues past a few million seeds +- depending on source/context, run fatcat import with arabesque results +- periodically run GROBID (and other transforms) over all new harvested files + +Issues with this are: + +- seedlist generation and arabesque step are toilsome (manual), and arabesque + likely has metadata issues or otherwise "leaks" content +- brozzler pipeline is entirely separate +- results in re-crawls of content already in wayback, in particular links + between large corpuses + +New plan: + +- download dump, filter, transform into ingest requests (mostly the same as + before) +- load into ingest-request SQL table. only new rows (unique by source, type, + and URL) are loaded. run a SQL query for new rows from the source with URLs + that have not been ingested +- (optional) pre-crawl bulk/direct URLs using heritrix3, as before, to reduce + later load on SPN +- run ingest script over the above SQL output. ingest first hits CDX/wayback, + and falls back to SPNv2 (brozzler) for "hard" requests, or based on URL. + ingest worker handles file metadata, GROBID, any other processing. 
results go + to kafka, then SQL table +- either do a bulk fatcat import (via join query), or just have workers + continuously import into fatcat from kafka ingest feed (with various quality + checks) + +## Request/Response Schema + +For now, plan is to have a single request type, and multiple similar but +separate result types, depending on the ingest type (file, fileset, +webcapture). The initial use case is single file PDF ingest. + +NOTE: what about crawl requests where we don't know if we will get a PDF or +HTML? Or both? Let's just recrawl. + +*IngestRequest* + - `ingest_type`: required, one of `pdf`, `xml`, `html`, `dataset`. For + backwards compatibility, `file` should be interpreted as `pdf`. `pdf` and + `xml` return file ingest respose; `html` and `dataset` not implemented but + would be webcapture (wayback) and fileset (archive.org item or wayback?). + In the future: `epub`, `video`, `git`, etc. + - `base_url`: required, where to start crawl process + - `link_source`: recommended, slug string. indicating the database or "authority" + where URL/identifier match is coming from (eg, `doi`, `pmc`, `unpaywall` + (doi), `s2` (semantic-scholar id), `spn` (fatcat release), `core` (CORE + id), `mag` (MAG id)) + - `link_source_id`: recommended, identifier string. pairs with `link_source`. + - `ingest_request_source`: recommended, slug string. tracks the service or + user who submitted request. eg, `fatcat-changelog`, `editor_<ident>`, + `savepapernow-web` + - `release_stage`: optional. indicates the release stage of fulltext expected to be found at this URL + - `rel`: optional. indicates the link type + - `force_recrawl`: optional. if true, will always SPNv2 (won't check wayback) + - `oa_status`: optional. unpaywall schema + - `edit_extra`: additional metadata to be included in any eventual fatcat commits. + - `fatcat` + - `release_ident`: optional. if provided, indicates that ingest is expected + to be fulltext copy of this release (though may be a sibling release + under same work if `release_stage` doesn't match) + - `work_ident`: optional, unused. might eventually be used if, eg, + `release_stage` of ingested file doesn't match that of the `release_ident` + - `ext_ids`: matching fatcat schema. used for later lookups. sometimes + `link_source` and id are sufficient. + - `doi` + - `pmcid` + - ... + +*FileIngestResult* + - `request` (object): the full IngestRequest, copied + - `status` (slug): 'success', 'error', etc + - `hit` (boolean): whether we got something that looks like what was requested + - `terminal` (object): last crawled resource (if any) + - `terminal_url` (string; formerly `url`) + - `terminal_dt` (string): wayback capture datetime (string) + - `terminal_status_code` + - `terminal_sha1hex`: should match true `file_meta` SHA1 (not necessarily CDX SHA1) + (in case of transport encoding difference) + - `file_meta` (object): info about the terminal file + - same schema as sandcrawler-db table + - `size_bytes` + - `md5hex` + - `sha1hex` + - `sha256hex` + - `mimetype`: if not know, `application/octet-stream` + - `cdx`: CDX record matching terminal resource. 
*MAY* be a revisit or partial + record (eg, if via SPNv2) + - same schema as sandcrawler-db table + - `revisit_cdx` (optional): if `cdx` is a revisit record, this will be the + best "original" location for retrieval of the body (matching `flie_meta`) + - same schema as sandcrawler-db table + - `grobid` + - same schema as sandcrawler-db table + - `status` (string) + - `status_code` (int) + - `grobid_version` (string, from metadata) + - `fatcat_release` (string, from metadata) + - `metadata` (JSON) (with `grobid_version` and `fatcat_release` removed) + - NOT `tei_xml` (strip from reply) + - NOT `file_meta` (strip from reply) + +In general, it is the `terminal_dt` and `terminal_url` that should be used to +construct wayback links (eg, for insertion to fatcat), not from the `cdx`. + +## New SQL Tables + +Sandcrawler should persist status about: + +- claimed locations (links) to fulltext copies of in-scope works, from indexes + like unpaywall, MAG, semantic scholar, CORE + - with enough context to help insert into fatcat if works are crawled and + found. eg, external identifier that is indexed in fatcat, and + release-stage +- state of attempting to crawl all such links + - again, enough to insert into fatcat + - also info about when/how crawl happened, particularly for failures, so we + can do retries + +Proposing two tables: + + -- source/source_id examples: + -- unpaywall / doi + -- mag / mag_id + -- core / core_id + -- s2 / semanticscholar_id + -- doi / doi (for any base_url which is just https://doi.org/10..., regardless of why enqueued) + -- pmc / pmcid (for any base_url like europmc.org, regardless of why enqueued) + -- arxiv / arxiv_id (for any base_url like arxiv.org, regardless of why enqueued) + CREATE TABLE IF NOT EXISTS ingest_request ( + -- conceptually: source, source_id, ingest_type, url + -- but we use this order for PRIMARY KEY so we have a free index on type/URL + ingest_type TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1), + base_url TEXT NOT NULL CHECK (octet_length(url) >= 1), + link_source TEXT NOT NULL CHECK (octet_length(link_source) >= 1), + link_source_id TEXT NOT NULL CHECK (octet_length(link_source_id) >= 1), + + created TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, + release_stage TEXT CHECK (octet_length(release_stage) >= 1), + request JSONB, + -- request isn't required, but can stash extra fields there for import, eg: + -- ext_ids (source/source_id sometimes enough) + -- release_ident (if ext_ids and source/source_id not specific enough; eg SPN) + -- edit_extra + -- rel + -- oa_status + -- ingest_request_source TEXT NOT NULL CHECK (octet_length(ingest_request_source) >= 1), + + PRIMARY KEY (ingest_type, base_url, link_source, link_source_id) + ); + + CREATE TABLE IF NOT EXISTS ingest_file_result ( + ingest_type TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1), + base_url TEXT NOT NULL CHECK (octet_length(url) >= 1), + + updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, + hit BOOLEAN NOT NULL, + status TEXT + terminal_url TEXT, INDEX + terminal_dt TEXT + terminal_status_code INT + terminal_sha1hex TEXT, INDEX + + PRIMARY KEY (ingest_type, base_url) + ); + +## New Kafka Topics + +- `sandcrawler-ENV.ingest-file-requests` +- `sandcrawler-ENV.ingest-file-results` + +## Ingest Tool Design + +The basics of the ingest tool are to: + +- use native wayback python library to do fast/efficient lookups and redirect + lookups +- starting from base-url, do a fetch to either target resource or landing page: + follow redirects, at terminus should have both CDX 
metadata and response body + - if no capture, or most recent is too old (based on request param), do + SPNv2 (brozzler) fetches before wayback lookups +- if looking for PDF but got landing page (HTML), try to extract a PDF link + from HTML using various tricks, then do another fetch. limit this + recursion/spidering to just landing page (or at most one or two additional + hops) + +Note that if we pre-crawled with heritrix3 (with `citation_pdf_url` link +following), then in the large majority of simple cases we + +## Design Issues + +### Open Questions + +Do direct aggregator/repositories crawls need to go through this process? Eg +arxiv.org or pubmed central. I guess so, otherwise how do we get full file +metadata (size, other hashes)? + +When recording hit status for a URL (ingest result), is that status dependent +on the crawl context? Eg, for save-paper-now we might want to require GROBID. +Semantics of `hit` should probably be consistent: if we got the filetype +expected based on type, not whether we would actually import to fatcat. + +Where to include knowledge about, eg, single-page abstract PDFs being bogus? Do +we just block crawling, set an ingest result status, or only filter at fatcat +import time? Definitely need to filter at fatcat import time to make sure +things don't slip through elsewhere. + +### Yet Another PDF Harvester + +This system could result in "yet another" set of publisher-specific heuristics +and hacks to crawl publicly available papers. Related existing work includes +[unpaywall's crawler][unpaywall_crawl], LOCKSS extraction code, dissem.in's +efforts, zotero's bibliography extractor, etc. The "memento tracer" work is +also similar. Many of these are even in python! It would be great to reduce +duplicated work and maintenance. An analagous system in the wild is youtube-dl +for downloading video from many sources. + +[unpaywall_crawl]: https://github.com/ourresearch/oadoi/blob/master/webpage.py +[memento_tracer]: http://tracer.mementoweb.org/ + +One argument against this would be that our use-case is closely tied to +save-page-now, wayback, and the CDX API. However, a properly modular +implementation of a paper downloader would allow components to be re-used, and +perhaps dependency ingjection for things like HTTP fetches to allow use of SPN +or similar. Another argument for modularity would be support for headless +crawling (eg, brozzler). + +Note that this is an internal implementation detail; the ingest API would +abstract all this. + +## Test Examples + +Some example works that are difficult to crawl. Should have mechanisms to crawl +and unit tests for all these. + +- <https://pubs.acs.org> +- <https://linkinghub.elsevier.com> / <https://sciencedirect.com> +- <https://www.osapublishing.org/captcha/?guid=39B0E947-C0FC-B5D8-2C0C-CCF004FF16B8> +- <https://utpjournals.press/action/cookieAbsent> +- <https://academic.oup.com/jes/article/3/Supplement_1/SUN-203/5484104> +- <http://www.jcancer.org/v10p4038.htm> diff --git a/proposals/2019_pdftotext_pdfinfo.md b/proposals/2019_pdftotext_pdfinfo.md new file mode 100644 index 0000000..ed731a4 --- /dev/null +++ b/proposals/2019_pdftotext_pdfinfo.md @@ -0,0 +1,123 @@ + +status: brainstorming/backburner + +last updated: 2019-12-11 + +This document proposes changes to extract text and metadata from PDFs at ingest +time using pdftotext and pdfinfo, and storing this content in SQL and minio. + +This isn't a priority at the moment. 
Could be useful for fulltext search when +GROBID fails, and the pdfinfo output might help with other quality checks. + +## Overview / Motivation + +`pdfinfo` and `pdftotext` can both be run quickly over raw PDFs. In +sandcrawler, fetching PDFs can be a bit slow, so the motivation for caching the +text is just to not have to fetch the PDFs over and over. Metadata is useful to +store and index at scale. + +## pdfinfo output + +Example PDF info outputs: + + Creator: PDF Suite 2010 + Producer: PDF Suite 2010 + CreationDate: Tue Sep 24 23:03:58 2013 PDT + ModDate: Tue Sep 24 23:03:58 2013 PDT + Tagged: no + UserProperties: no + Suspects: no + Form: none + JavaScript: no + Pages: 17 + Encrypted: no + Page size: 612 x 792 pts (letter) + Page rot: 0 + File size: 105400 bytes + Optimized: no + PDF version: 1.4 + +another: + + Title: Miscellanea Zoologica Hungarica 8. 1993 (Budapest, 1993) + Author: L. Forró szerk. + Producer: ABBYY FineReader 9.0 Corporate Edition + CreationDate: Wed Apr 13 05:30:21 2011 PDT + ModDate: Wed Apr 13 09:53:27 2011 PDT + Tagged: yes + UserProperties: no + Suspects: no + Form: AcroForm + JavaScript: no + Pages: 13 + Encrypted: no + Page size: 473.76 x 678.42 pts + Page rot: 0 + File size: 12047270 bytes + Optimized: no + PDF version: 1.6 + +With the `-meta` flag, you get XML output, which also includes: + + <xmpMM:DocumentID>uuid:cd1a8daa-61e1-48f4-b679-26eac52bb6a9</xmpMM:DocumentID> + <xmpMM:InstanceID>uuid:dea54c78-8bc6-4f2f-a665-4cd7e62457e7</xmpMM:InstanceID> + +The document id is particularly interesting for fatcat/sandcrawler. Apparently +it is randomly created (or based on md5?) of first version of the file, and +persists across edits. A quality check would be that all files with the same +`document_id` should be clustered under the same fatcat work. + +All the info fields could probably be combined and used in categorization and +filtering (ML or heuristic). Eg, a PDF with forms is probably not research +output; published PDFs with specific "Producer" software probably are. + +## Fatcat Changes + +Could include in entity fields, a `pdfinfo` JSONB field, or existing `extra`: + +- pages +- words +- document id +- page size +- created +- other meta (eg, PDF title, author, etc) + +All of these fields are, I assume, deterministic, thus appropriate for +inclusion in fatcat. + +## New SQL Tables + + CREATE TABLE IF NOT EXISTS pdftotext ( + sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), + updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, + tool_version TEXT CHECK (octet_length(tool_version) >= 1), + text_success BOOLEAN NOT NULL, + text_words INT, + info_success BOOLEAN NOT NULL, + pages INT, + pdf_created TIMESTAMP WITH TIME ZONE, + document_id TEXT CHECK (octet_length(document_id) >= 1), -- XXX: always UUID? + metadata JSONB + -- metadata contains any other stuff from pdfinfo: + -- title + -- author + -- pdf version + -- page size (?) + -- instance_id + ); + -- CREATE INDEX pdftotext ON pdftotext(document_id); + +## New Kafka Topics + + sandcrawler-ENV.pdftotext-output + +Key would be sha1hex of PDF. + +Schema would match the SQL table, plus the full raw PDF text output. 
+ +## New Minio Stuff + + /pdftotext/<hexbyte0>/<hexbyte1>/<sha1hex>.txt + +## Open Questions + diff --git a/proposals/20200129_pdf_ingest.md b/proposals/20200129_pdf_ingest.md new file mode 100644 index 0000000..9469217 --- /dev/null +++ b/proposals/20200129_pdf_ingest.md @@ -0,0 +1,272 @@ + +status: planned + +2020q1 Fulltext PDF Ingest Plan +=================================== + +This document lays out a plan and tasks for a push on crawling and ingesting +more fulltext PDF content in early 2020. + +The goal is to get the current generation of pipelines and matching tools +running smoothly by the end of March, when the Mellon phase 1 grant ends. As a +"soft" goal, would love to see over 25 million papers (works) with fulltext in +fatcat by that deadline as well. + +This document is organized by conceptual approach, then by jobs to run and +coding tasks needing work. + +There is a lot of work here! + + +## Broad OA Ingest By External Identifier + +There are a few million papers in fatacat which: + +1. have a DOI, arxiv id, or pubmed central id, which can be followed to a + landing page or directly to a PDF +2. are known OA, usually because publication is Gold OA +3. don't have any fulltext PDF in fatcat + +As a detail, some of these "known OA" journals actually have embargos (aka, +they aren't true Gold OA). In particular, those marked via EZB OA "color", and +recent pubmed central ids. + +Of these, I think there are broadly two categories. The first is just papers we +haven't tried directly crawling or ingesting yet at all; these should be easy +to crawl and ingest. The second category is papers from large publishers with +difficult to crawl landing pages (for example, Elsevier, IEEE, Wiley, ACM). The +later category will probably not crawl with heritrix, and we are likely to be +rate-limited or resource constrained when using brozzler or + +Coding Tasks: + +- improve `fatcat_ingest.py` script to allow more granular slicing and limiting + the number of requests enqueued per batch (eg, to allow daily partial + big-publisher ingests in random order). Allow dumping arxiv+pmcid ingest + requests. + +Actions: + +- run broad Datacite DOI landing crawl with heritrix ("pre-ingest") +- after Datacite crawl completes, run arabesque and ingest any PDF hits +- run broad non-Datacite DOI landing crawl with heritrix. Use ingest tool to + generate (or filter a dump), removing Datacite DOIs and large publishers +- after non-Datacite crawl completes, run entire ingest request set through in + bulk mode +- start enqueing large-publisher (hard to crawl) OA DOIs to ingest queue + for SPNv2 crawling (blocking ingest tool improvement, and also SPNv2 health) +- start new PUBMEDCENTRAL and ARXIV slow-burn pubmed crawls (heritrix). Use + updated ingest tool to generate requests. + + +## Large Seedlist Crawl Iterations + +We have a bunch of large, high quality seedlists, most of which haven't been +updated or crawled in a year or two. Some use DOIs as identifiers, some use an +internal identifier. As a quick summary: + +- unpaywall: currently 25 million DOIs (Crossref only?) with fulltext. URLs may + be doi.org, publisher landing page, or direct PDF; may be published version, + pre-print, or manuscript (indicated with a flag). Only crawled with heritrix; + last crawl was Spring 2019. There is a new dump from late 2019 with a couple + million new papers/URLs. +- microsoft academic (MAG): tens of millions of papers, hundreds of millions of + URLs. Last crawled 2018 (?) from a 2016 dump. 
Getting a new full dump via + Azure; new dump includes type info for each URL ("pdf", "landing page", etc). + Uses MAG id for each URL, not DOI; hoping new dump has better MAG/DOI + mappings. Expect a very large crawl (tens of millions of new URLs). +- CORE: can do direct crawling of PDFs from their site, as well as external + URLs. They largely have pre-prints and IR content. Have not released a dump + in a long time. Would expect a couple million new direct (core.ac.uk) URLs + and fewer new web URLs (often overlap with other lists, like MAG) +- semantic scholar: they do regular dumps. Use SHA1 hash of PDF as identifier; + it's the "best PDF of a group", so not always the PDF you crawl. Host many OA + PDFs on their domain, very fast to crawl, as well as wide-web URLs. Their + scope has increased dramatically in recent years due to MAG import; expect a + lot of overlap there. + +It is increasingly important to not + +Coding Tasks: +- transform scripts for all these seedlist sources to create ingest request + lists +- sandcrawler ingest request persist script, which supports setting datetime +- fix HBase thrift gateway so url agnostic de-dupe can be updated +- finish ingest worker "skip existing" code path, which looks in sandcrawler-db + to see if URL has already been processed (for efficiency) + +Actions: +- transform and persist all these old seedlists, with the URL datetime set to + roughly when the URL was added to the upstream corpus +- transform arabesque output for all old crawls into ingest requests and run + through the bulk ingest queue. expect GROBID to be skipped for all these, and + for the *requests* not to be updated (SQL ON CONFLICT DO NOTHING). Will + update ingest result table with status. +- fetch new MAG and unpaywall seedlists, transform to ingest requests, persist + into ingest request table. use SQL to dump only the *new* URLs (not seen in + previous dumps) using the created timestamp, outputing new bulk ingest + request lists. if possible, de-dupe between these two. then start bulk + heritrix crawls over these two long lists. Probably sharded over several + machines. Could also run serially (first one, then the other, with + ingest/de-dupe in between). Filter out usual large sites (core, s2, arxiv, + pubmed, etc) +- CORE and Semantic Scholar direct crawls, of only new URLs on their domain + (should not significantly conflict/dupe with other bulk crawls) + +After this round of big crawls completes we could do iterated crawling of +smaller seedlists, re-visit URLs that failed to ingest with updated heritrix +configs or the SPNv2 ingest tool, etc. + + +## GROBID/glutton Matching of Known PDFs + +Of the many PDFs in the sandcrawler CDX "working set", many were broadly +crawled or added via CDX heuristic. In other words, we don't have an identifier +from a seedlist. We previously run a matching script in Hadoop that attempted +to link these to Crossref DOIs based on GROBID extracted metadata. We haven't +done this in a long time; in the meanwhile we have added many more such PDFs, +added lots of metadata to our matching set (eg, pubmed and arxiv in addition to +crossref), and have the new biblio-glutton tool for matching, which may work +better than our old conservative tool. + +We have run GROBID+glutton over basically all of these PDFs. 
We should be able +to do a SQL query to select PDFs that: + +- have at least one known CDX row +- GROBID processed successfuly and glutton matched to a fatcat release +- do not have an existing fatcat file (based on sha1hex) +- output GROBID metadata, `file_meta`, and one or more CDX rows + +An update match importer can take this output and create new file entities. +Then lookup the release and confirm the match to the GROBID metadata, as well +as any other quality checks, then import into fatcat. We have some existing +filter code we could use. The verification code should be refactored into a +reusable method. + +It isn't clear to me how many new files/matches we would get from this, but +could do some test SQL queries to check. At least a million? + +A related task is to update the glutton lookup table (elasticsearch index and +on-disk lookup tables) after more recent metadata imports (Datacite, etc). +Unsure if we should filter out records or improve matching so that we don't +match "header" (paper) metadata to non-paper records (like datasets), but still +allow *reference* matching (citations to datasets). + +Coding Tasks: +- write SQL select function. Optionally, come up with a way to get multiple CDX + rows in the output (sub-query?) +- biblio metadata verify match function (between GROBID metadata and existing + fatcat release entity) +- updated match fatcat importer + +Actions: +- update `fatcat_file` sandcrawler table +- check how many PDFs this might ammount to. both by uniq SHA1 and uniq + `fatcat_release` matches +- do some manual random QA verification to check that this method results in + quality content in fatcat +- run full updated import + + +## No-Identifier PDF New Release Import Pipeline + +Previously, as part of longtail OA crawling work, I took a set of PDFs crawled +from OA journal homepages (where the publisher does not register DOIs), took +successful GROBID metadata, filtered for metadata quality, and imported about +1.5 million new release entities into fatcat. + +There were a number of metadata issues with this import that we are still +cleaning up, eg: + +- paper actually did have a DOI and should have been associated with existing + fatcat release entity; these PDFs mostly came from repository sites which + aggregated many PDFs, or due to unintentional outlink crawl configs +- no container linkage for any of these releases, making coverage tracking or + reporting difficult +- many duplicates in same import set, due to near-identical PDFs (different by + SHA-1, but same content and metadata), not merged or grouped in any way + +The cleanup process is out of scope for this document, but we want to do +another round of similar imports, while avoiding these problems. 
+ +As a rouch sketch of what this would look like (may need to iterate): + +- filter to PDFs from longtail OA crawls (eg, based on WARC prefix, or URL domain) +- filter to PDFs not in fatcat already (in sandcrawler, then verify with lookup) +- filter to PDFs with successful GROBID extraction and *no* glutton match +- filter/clean GROBID extracted metadata (in python, not SQL), removing stubs + or poor/partial extracts +- run a fuzzy biblio metadata match against fatcat elasticsearch; use match + verification routine to check results +- if fuzzy match was a hit, consider importing directly as a matched file + (especially if there are no existing files for the release) +- identify container for PDF from any of: domain pattern/domain; GROBID + extracted ISSN or journal name; any other heuristic +- if all these filters pass and there was no fuzzy release match, and there was + a container match, import a new release (and the file) into fatcat + +Not entirely clear how to solve the near-duplicate issue. Randomize import +order (eg, sort by file sha1), import slowly with a single thread, and ensure +elasticsearch re-indexing pipeline is running smoothly so the fuzzy match will +find recently-imported hits? + +In theory we could use biblio-glutton API to do the matching lookups, but I +think it will be almost as fast to hit our own elasticsearch index. Also the +glutton backing store is always likely to be out of date. In the future we may +even write something glutton-compatible that hits our index. Note that this is +also very similar to how citation matching could work, though it might be +derailing or over-engineering to come up with a single solution for both +applications at this time. + +A potential issue here is that many of these papers are probably already in +another large but non-authoritative metadata corpus, like MAG, CORE, SHARE, or +BASE. Importing from those corpuses would want to go through the same fuzzy +matching to ensure we aren't creating duplicate releases, but further it would +be nice to be matching those external identifiers for any newly created +releases. One approach would be to bulk-import metadata from those sources +first. There are huge numbers of records in those corpuses, so we would need to +filter down by journal/container or OA flag first. Another would be to do fuzzy +matching when we *do* end up importing those corpuses, and update these records +with the external identifiers. This issue really gets at the crux of a bunch of +design issues and scaling problems with fatcat! But I think we should or need +to make progress on these longtail OA imports without perfectly solving these +larger issues. + +Details/Questions: +- what about non-DOI metadata sources like MAG, CORE, SHARE, BASE? Should we + import those first, or do fuzzy matching against those? +- use GROBID language detection and copy results to newly created releases +- in single-threaded, could cache "recently matched/imported releases" locally + to prevent double-importing +- cache container matching locally + +Coding Tasks: +- write SQL select statement +- iterate on GROBID metadata cleaning/transform/filter (have existing code for + this somewhere) +- implement a "fuzzy match" routine that takes biblio metadata (eg, GROBID + extracted), looks in fatcat elasticsearch for a match +- implement "fuzzy container match" routine, using as much available info as + possible. 
Could use chocula sqlite locally, or hit elasticsearch container + endpoint +- update GROBID importer to use fuzzy match and other checks + +Actions: +- run SQL select and estimate bounds on number of new releases created +- do some manual randomized QA runs to ensure this pipeline is importing + quality content in fatcat +- run a full batch import + + +## Non-authoritative Metadata and Fulltext from Aggregators + +This is not fully thought through, but at some point we will probably add one +or more large external aggregator metadata sources (MAG, Semantic Scholar, +CORE, SHARE, BASE), and bulk import both metadata records and fulltext at the +same time. The assumption is that those sources are doing the same fuzzy entity +merging/de-dupe and crawling we are doing, but they have already done it +(probably with more resources) and created stable identifiers that we can +include. + +A major blocker for most such imports is metadata licensing (fatcat is CC0, +others have restrictions). This may not be the case for CORE and SHARE though. diff --git a/proposals/20200207_pdftrio.md b/proposals/20200207_pdftrio.md new file mode 100644 index 0000000..31a2db6 --- /dev/null +++ b/proposals/20200207_pdftrio.md @@ -0,0 +1,104 @@ + +status: in progress + +PDF Trio (ML Classification) +============================== + +This document describes how we intent to integrate the first generation of PDF +classification work into the sandcrawler processing system. + +- abstractions (APIs) +- schemas +- how models and dependencies are deployed +- what code is release where under what license + + +## Code Structure + +Major components: + +**Training code, documentation, datasets:** Not used at run-time (does not need +to be deployed). Should be public. The datasets (PDFs) are copyrighted, so we +should only release URL lists that point to wayback. + +**Models:** all are static, uploaded to archive.org items, simple download to +deploy. Should be versioned, and have unique versioned file names or directory +paths (aka, deploy in parallel). + +**Image classifier backend:** vanilla tensorflow serving docker image, with a +bunch of invocation configs, plus static models. + +**BERT backend:** vanilla tensorflow serving docker image, plus config, plus +models. Basically same as image classifier. + +**API service:** currently Flask. Depends on tools like imagemagik, fasttext, +pdftotext. Seems like apt+pipenv should work? + + +## API Refactors + +Changes: + +- probably re-write README? +- refactor python code into directories +- add python tests +- tweak schema +- proper parallelization: uwsgi? async? + +New features: + +- option to send images, raw text in batches in addition to PDFs. + +## Client Code + +Basically just like GROBID client for now. Requests, JSON. + +## JSON Schema + +Output that goes in Kafka topic: + + key (sha1hex) + pdf_trio + status + status_code + ensemble_score + bert_score + image_score + linear_score + versions + pdftrio_version (string) + models_date (string, ISO date) + git_rev (string) + bert_model (string) + image_model (string) + linear_model (string) + timing (optional/future: as reported by API) + ... + file_meta + sha1hex + ... + timing + ... + + +## SQL Schema + +Ensemble model versions are summarized as a date. 
+ + CREATE TABLE IF NOT EXISTS pdftrio ( + sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), + updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, + status_code INT NOT NULL, + status TEXT CHECK (octet_length(status) >= 1) NOT NULL, + pdftrio_version TEXT CHECK (octet_length(pdftrio_version) >= 1), + models_date DATE, + ensemble_score REAL, + bert_score REAL, + linear_score REAL, + image_score REAL + ); + +## Kafka Topic + +sandcrawler-qa.pdftrio-output + diff --git a/proposals/20200211_nsq.md b/proposals/20200211_nsq.md new file mode 100644 index 0000000..6aa885b --- /dev/null +++ b/proposals/20200211_nsq.md @@ -0,0 +1,79 @@ + +status: planned + +In short, Kafka is not working well as a job task scheduler, and I want to try +NSQ as a medium-term solution to that problem. + + +## Motivation + +Thinking of setting up NSQ to use for scheduling distributed work, to replace +kafka for some topics. for example, "regrobid" requests where we enqueue +millions of, basically, CDX lines, and want to process on dozens of cores or +multiple machines. or file ingest backfill. results would still go to kafka (to +persist), and pipelines like DOI harvest -> import -> elasticsearch would still +be kafka + +The pain point with kafka is having dozens of workers on tasks that take more +than a couple seconds per task. we could keep tweaking kafka and writing weird +consumer group things to handle this, but I think it will never work very well. +NSQ supports re-queues with delay (eg, on failure, defer to re-process later), +allows many workers to connect and leave with no disruption, messages don't +have to be processed in order, and has a very simple enqueue API (HTTP POST). + +The slowish tasks we have now are file ingest (wayback and/or SPNv2 + +GROBID) and re-GROBID. In the near future will also have ML backlog to go +through. + +Throughput isn't much of a concern as tasks take 10+ seconds each. + + +## Specific Plan + +Continue publishing ingest requests to Kafka topic. Have a new persist worker +consume from this topic and push to request table (but not result table) using +`ON CONFLICT DO NOTHING`. Have a new single-process kafka consumer pull from +the topic and push to NSQ. This consumer monitors NSQ and doesn't push too many +requests (eg, 1k maximum). NSQ could potentially even run as in-memory mode. +New worker/pusher class that acts as an NSQ client, possibly with parallelism. + +*Clean* NSQ shutdown/restart always persists data locally to disk. + +Unclean shutdown (eg, power failure) would mean NSQ might have lost state. +Because we are persisting requests to sandcrawler-db, cleanup is simple: +re-enqueue all requests from the past N days with null result or result older +than M days. + +Still need multiple kafka and NSQ topics to have priority queues (eg, bulk, +platform-specific). + +To start, have a single static NSQ host; don't need nsqlookupd. Could use +wbgrp-svc506 (datanode VM with SSD, lots of CPU and RAM). + +To move hosts, simply restart the kafka pusher pointing at the new NSQ host. +When the old host's queue is empty, restart the workers to consume from the new +host, and destroy the old NSQ host. + + +## Alternatives + +Work arounds i've done to date have been using the `grobid_tool.py` or +`ingest_tool.py` JSON input modes to pipe JSON task files (millions of lines) +through GNU/parallel. I guess GNU/parallel's distributed mode is also an option +here. + +Other things that could be used: + +**celery**: popular, many features. 
need to run separate redis, no disk persistence (?) + +**disque**: need to run redis, no disk persistence (?) <https://github.com/antirez/disque> + +**gearman**: <http://gearman.org/> no disk persistence (?) + + +## Old Notes + +TBD if would want to switch ingest requests from fatcat -> sandcrawler over, +and have the continuous ingests run out of NSQ, or keep using kafka for that. +currently can only do up to 10x parallelism or so with SPNv2, so that isn't a +scaling pain point diff --git a/proposals/20201012_no_capture.md b/proposals/20201012_no_capture.md new file mode 100644 index 0000000..bb47ea2 --- /dev/null +++ b/proposals/20201012_no_capture.md @@ -0,0 +1,36 @@ + +status: in-progress + +Storing no-capture missing URLs in `terminal_url` +================================================= + +Currently, when the bulk-mode ingest code terminates with a `no-capture` +status, the missing URL (which is not in GWB CDX) is not stored in +sandcrawler-db. This proposed change is to include it in the existing +`terminal_url` database column, with the `terminal_status_code` and +`terminal_dt` columns empty. + +The implementation is rather simple: + +- CDX lookup code path should save the *actual* final missing URL (`next_url` + after redirects) in the result object's `terminal_url` field +- ensure that this field gets passed through all the way to the database on the + `no-capture` code path + +This change does change the semantics of the `terminal_url` field somewhat, and +could break existing assumptions, so it is being documented in this proposal +document. + + +## Alternatives + +The current status quo is to store the missing URL as the last element in the +"hops" field of the JSON structure. We could keep this and have a convoluted +pipeline that would read from the Kafka feed and extract them, but this would +be messy. Eg, re-ingesting would not update the old kafka messages, so we could +need some accounting of consumer group offsets after which missing URLs are +truely missing. + +We could add a new `missing_url` database column and field to the JSON schema, +for this specific use case. This seems like unnecessary extra work. + diff --git a/proposals/20201026_html_ingest.md b/proposals/20201026_html_ingest.md new file mode 100644 index 0000000..785471b --- /dev/null +++ b/proposals/20201026_html_ingest.md @@ -0,0 +1,129 @@ + +status: deployed + +HTML Ingest Pipeline +======================== + +Basic goal: given an ingest request of type 'html', output an object (JSON) +which could be imported into fatcat. + +Should work with things like (scholarly) blog posts, micropubs, registrations, +protocols. Doesn't need to work with everything to start. "Platform" sites +(like youtube, figshare, etc) will probably be a different ingest worker. + +A current unknown is what the expected size of this metadata is. Both in number +of documents and amount of metadata per document. + +Example HTML articles to start testing: + +- complex distill article: <https://distill.pub/2020/bayesian-optimization/> +- old HTML journal: <http://web.archive.org/web/20081120141926fw_/http://www.mundanebehavior.org/issues/v5n1/rosen.htm> +- NIH pub: <https://www.nlm.nih.gov/pubs/techbull/ja02/ja02_locatorplus_merge.html> +- first mondays (OJS): <https://firstmonday.org/ojs/index.php/fm/article/view/10274/9729> +- d-lib: <http://www.dlib.org/dlib/july17/williams/07williams.html> + + +## Ingest Process + +Follow base URL to terminal document, which is assumed to be a status=200 HTML document. 
+ +Verify that terminal document is fulltext. Extract both metadata and fulltext. + +Extract list of sub-resources. Filter out unwanted (eg favicon, analytics, +unnecessary), apply a sanity limit. Convert to fully qualified URLs. For each +sub-resource, fetch down to the terminal resource, and compute hashes/metadata. + +Open questions: + +- will probably want to parallelize sub-resource fetching. async? +- behavior when failure fetching sub-resources + + +## Ingest Result Schema + +JSON should be basically compatible with existing `ingest_file_result` objects, +with some new sub-objects. + +Overall object (`IngestWebResult`): + +- `status`: str +- `hit`: bool +- `error_message`: optional, if an error +- `hops`: optional, array of URLs +- `cdx`: optional; single CDX row of primary HTML document +- `terminal`: optional; same as ingest result + - `terminal_url` + - `terminal_dt` + - `terminal_status_code` + - `terminal_sha1hex` +- `request`: optional but usually present; ingest request object, verbatim +- `file_meta`: optional; file metadata about primary HTML document +- `html_biblio`: optional; extracted biblio metadata from primary HTML document +- `scope`: optional; detected/guessed scope (fulltext, etc) +- `html_resources`: optional; array of sub-resources. primary HTML is not included +- `html_body`: optional; just the status code and some metadata is passed through; + actual document would go through a different KafkaTopic + - `status`: str + - `agent`: str, eg "trafilatura/0.4" + - `tei_xml`: optional, str + - `word_count`: optional, str + + +## New SQL Tables + +`html_meta` + sha1hex (primary key) + updated (of SQL row) + status + scope + has_teixml + has_thumbnail + word_count (from teixml fulltext) + biblio (JSON) + resources (JSON) + +Also writes to `ingest_file_result`, `file_meta`, and `cdx`, all only for the base HTML document. + +Note: needed to enable postgrest access to this table (for scholar worker). + + +## Fatcat API Wants + +Would be nice to have lookup by SURT+timestamp, and/or by sha1hex of terminal base file. + +`hide` option for cdx rows; also for fileset equivalent. + + +## New Workers + +Could reuse existing worker, have code branch depending on type of ingest. + +ingest file worker + => same as existing worker, because could be calling SPN + +persist result + => same as existing worker; adds persisting various HTML metadata + +persist html text + => talks to seaweedfs + + +## New Kafka Topics + +HTML ingest result topic (webcapture-ish) + +sandcrawler-ENV.html-teixml + JSON wrapping TEI-XML (same as other fulltext topics) + key compaction and content compression enabled + +JSON schema: + +- `key` and `sha1hex`: str; used as kafka key +- `status`: str +- `tei_xml`: str, optional +- `word_count`: int, optional + +## New S3/SeaweedFS Content + +`sandcrawler` bucket, `html` folder, `.tei.xml` suffix. + diff --git a/proposals/20201103_xml_ingest.md b/proposals/20201103_xml_ingest.md new file mode 100644 index 0000000..181cc11 --- /dev/null +++ b/proposals/20201103_xml_ingest.md @@ -0,0 +1,81 @@ + +status: wip + +TODO: +x XML fulltext URL extractor (based on HTML biblio metadata, not PDF url extractor) +x differential JATS XML and scielo XML from generic XML? + application/xml+jats is what fatcat is doing for abstracts + but it should be application/jats+xml? 
+ application/tei+xml + if startswith "<article " and "<article-meta>" => JATS +x refactor ingest worker to be more general +x have ingest code publish body to kafka topic +x write a persist worker +/ create/configure kafka topic +- test everything locally +- fatcat: ingest tool to create requests +- fatcat: entity updates worker creates XML ingest requests for specific sources +- fatcat: ingest file import worker allows XML results +- ansible: deployment of persist worker + +XML Fulltext Ingest +==================== + +This document details changes to include XML fulltext ingest in the same way +that we currently ingest PDF fulltext. + +Currently this will just fetch the single XML document, which is often lacking +figures, tables, and other required files. + +## Text Encoding + +Because we would like to treat XML as a string in a couple contexts, but XML +can have multiple encodings (indicated in an XML header), we are in a bit of a +bind. Simply parsing into unicode and then re-encoding as UTF-8 could result in +a header/content mismatch. Any form of re-encoding will change the hash of the +document. For recording in fatcat, the file metadata will be passed through. +For storing in Kafka and blob store (for downstream analysis), we will parse +the raw XML document (as "bytes") with an XML parser, then re-output with UTF-8 +encoding. The hash of the *original* XML file will be used as the key for +refering to this document. This is unintuitive, but similar to what we are +doing with PDF and HTML documents (extracting in a useful format, but keeping +the original document's hash as a key). + +Unclear if we need to do this re-encode process for XML documents already in +UTF-8 encoding. + +## Ingest Worker + +Could either re-use HTML metadata extractor to fetch XML fulltext links, or +fork that code off to a separate method, like the PDF fulltext URL extractor. + +Hopefully can re-use almost all of the PDF pipeline code, by making that ingest +worker class more generic and subclassing it. + +Result objects are treated the same as PDF ingest results: the result object +has context about status, and if successful, file metadata and CDX row of the +terminal object. + +TODO: should it be assumed that XML fulltext will end up in S3 bucket? or +should there be an `xml_meta` SQL table tracking this, like we have for PDFs +and HTML? + +TODO: should we detect and specify the XML schema better? Eg, indicate if JATS. + + +## Persist Pipeline + +### Kafka Topic + +sandcrawler-ENV.xml-doc + similar to other fulltext topics; JSON wrapping the XML + key compaction, content compression + +### S3/SeaweedFS + +`sandcrawler` bucket, `xml` folder. Extension could depend on sub-type of XML? + +### Persist Worker + +New S3-only worker that pulls from kafka topic and pushes to S3. Works +basically the same as PDF persist in S3-only mode, or like pdf-text worker. diff --git a/proposals/2020_pdf_meta_thumbnails.md b/proposals/2020_pdf_meta_thumbnails.md new file mode 100644 index 0000000..793d6b5 --- /dev/null +++ b/proposals/2020_pdf_meta_thumbnails.md @@ -0,0 +1,328 @@ + +status: work-in-progress + +New PDF derivatives: thumbnails, metadata, raw text +=================================================== + +To support scholar.archive.org (fulltext search) and other downstream uses of +fatcat, want to extract from many PDFs: + +- pdf structured metadata +- thumbnail images +- raw extracted text + +A single worker should extract all of these fields, and publish in to two kafka +streams. 
Separate persist workers consume from the streams and push in to SQL +and/or seaweedfs. + +Additionally, this extraction should happen automatically for newly-crawled +PDFs as part of the ingest pipeline. When possible, checks should be run +against the existing SQL table to avoid duplication of processing. + + +## PDF Metadata and Text + +Kafka topic (name: `sandcrawler-ENV.pdf-text`; 12x partitions; gzip +compression) JSON schema: + + sha1hex (string; used as key) + status (string) + text (string) + page0_thumbnail (boolean) + meta_xml (string) + pdf_info (object) + pdf_extra (object) + word_count + file_meta (object) + source (object) + +For the SQL table we should have columns for metadata fields that are *always* +saved, and put a subset of other interesting fields in a JSON blob. We don't +need all metadata fields in SQL. Full metadata/info will always be available in +Kafka, and we don't want SQL table size to explode. Schema: + + CREATE TABLE IF NOT EXISTS pdf_meta ( + sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), + updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, + status TEXT CHECK (octet_length(status) >= 1) NOT NULL, + has_page0_thumbnail BOOLEAN NOT NULL, + page_count INT CHECK (page_count >= 0), + word_count INT CHECK (word_count >= 0), + page0_height REAL CHECK (page0_height >= 0), + page0_width REAL CHECK (page0_width >= 0), + permanent_id TEXT CHECK (octet_length(permanent_id) >= 1), + pdf_created TIMESTAMP WITH TIME ZONE, + pdf_version TEXT CHECK (octet_length(pdf_version) >= 1), + metadata JSONB + -- maybe some analysis of available fields? + -- metadata JSON fields: + -- title + -- subject + -- author + -- creator + -- producer + -- CrossMarkDomains + -- doi + -- form + -- encrypted + ); + + +## Thumbnail Images + +Kafka Schema is raw image bytes as message body; sha1sum of PDF as the key. No +compression, 12x partitions. + +Kafka topic name is `sandcrawler-ENV.pdf-thumbnail-SIZE-TYPE` (eg, +`sandcrawler-qa.pdf-thumbnail-180px-jpg`). Thus, topic name contains the +"metadata" of thumbail size/shape. + +Have decided to use JPEG thumbnails, 180px wide (and max 300px high, though +width restriction is almost always the limiting factor). This size matches that +used on archive.org, and is slightly larger than the thumbnails currently used +on scholar.archive.org prototype. We intend to tweak the scholar.archive.org +CSS to use the full/raw thumbnail image at max desktop size. At this size it +would be difficult (though maybe not impossible?) to extract text (other than +large-font titles). + + +### Implementation + +We use the `poppler` CPP library (wrapper for python) to extract and convert everything. 
+ +Some example usage of the `python-poppler` library: + + import poppler + from PIL import Image + + pdf = poppler.load_from_file("/home/bnewbold/10.1038@s41551-020-0534-9.pdf") + pdf.pdf_id + page = pdf.create_page(0) + page.page_rect().width + + renderer = poppler.PageRenderer() + full_page = renderer.render_page(page) + img = Image.frombuffer("RGBA", (full_page.width, full_page.height), full_page.data, 'raw', "RGBA") + img.thumbnail((180,300), Image.BICUBIC) + img.save("something.jpg") + +## Deployment and Infrastructure + +Deployment will involve: + +- sandcrawler DB SQL table + => guesstimate size 100 GByte for hundreds of PDFs +- postgrest/SQL access to new table for internal HTTP API hits +- seaweedfs raw text folder + => reuse existing bucket with GROBID XML; same access restrictions on content +- seaweedfs thumbnail bucket + => new bucket for this world-public content +- public nginx access to seaweed thumbnail bucket +- extraction work queue kafka topic + => same schema/semantics as ungrobided +- text/metadata kafka topic +- thumbnail kafka topic +- text/metadata persist worker(s) + => from kafka; metadata to SQL database; text to seaweedfs blob store +- thumbnail persist worker + => from kafka to seaweedfs blob store +- pdf extraction worker pool + => very similar to GROBID worker pool +- ansible roles for all of the above + +Plan for processing/catchup is: + +- test with COVID-19 PDF corpus +- run extraction on all current fatcat files avaiable via IA +- integrate with ingest pipeline for all new files +- run a batch catchup job over all GROBID-parsed files with no pdf meta + extracted, on basis of SQL table query + +## Appendix: Thumbnail Size and Format Experimentation + +Using 190 PDFs from `/data/pdfs/random_crawl/files` on my laptop to test. + +TODO: actually, 4x images failed to convert with pdftocairo; this throws off +"mean" sizes by a small amount. 
+ + time ls | parallel -j1 pdftocairo -singlefile -scale-to 200 -png {} /tmp/test-png/{}.png + real 0m29.314s + user 0m26.794s + sys 0m2.484s + => missing: 4 + => min: 0.8k + => max: 57K + => mean: 16.4K + => total: 3120K + + time ls | parallel -j1 pdftocairo -singlefile -scale-to 200 -jpeg {} /tmp/test-jpeg/{}.jpg + real 0m26.289s + user 0m24.022s + sys 0m2.490s + => missing: 4 + => min: 1.2K + => max: 13K + => mean: 8.02k + => total: 1524K + + time ls | parallel -j1 pdftocairo -singlefile -scale-to 200 -jpeg -jpegopt optimize=y,quality=80 {} /tmp/test-jpeg2/{}.jpg + real 0m27.401s + user 0m24.941s + sys 0m2.519s + => missing: 4 + => min: 577 + => max: 14K + => mean: + => total: 1540K + + time ls | parallel -j1 convert -resize 200x200 {}[0] /tmp/magick-png/{}.png + => missing: 4 + real 1m19.399s + user 1m17.150s + sys 0m6.322s + => min: 1.1K + => max: 325K + => mean: + => total: 8476K + + time ls | parallel -j1 convert -resize 200x200 {}[0] /tmp/magick-jpeg/{}.jpg + real 1m21.766s + user 1m17.040s + sys 0m7.155s + => total: 3484K + +NOTE: the following `pdf_thumbnail.py` images are somewhat smaller than the above +jpg and pngs (max 180px wide, not 200px wide) + + time ls | parallel -j1 ~/code/sandcrawler/python/scripts/pdf_thumbnail.py {} /tmp/python-png/{}.png + real 0m48.198s + user 0m42.997s + sys 0m4.509s + => missing: 2; 2x additional stub images + => total: 5904K + + time ls | parallel -j1 ~/code/sandcrawler/python/scripts/pdf_thumbnail.py {} /tmp/python-jpg/{}.jpg + real 0m45.252s + user 0m41.232s + sys 0m4.273s + => min: 1.4K + => max: 16K + => mean: ~9.3KByte + => total: 1772K + + time ls | parallel -j1 ~/code/sandcrawler/python/scripts/pdf_thumbnail.py {} /tmp/python-jpg-360/{}.jpg + real 0m48.639s + user 0m44.121s + sys 0m4.568s + => mean: ~28k + => total: 5364K (3x of 180px batch) + + quality=95 + time ls | parallel -j1 ~/code/sandcrawler/python/scripts/pdf_thumbnail.py {} /tmp/python-jpg2-360/{}.jpg + real 0m49.407s + user 0m44.607s + sys 0m4.869s + => total: 9812K + + quality=95 + time ls | parallel -j1 ~/code/sandcrawler/python/scripts/pdf_thumbnail.py {} /tmp/python-jpg2-180/{}.jpg + real 0m45.901s + user 0m41.486s + sys 0m4.591s + => mean: 16.4K + => total: 3116K + +At the 180px size, the difference between default and quality=95 seems +indistinguishable visually to me, but is more than a doubling of file size. +Also tried at 300px and seems near-indistinguishable there as well. + +At a mean of 10 Kbytes per file: + + 10 million -> 100 GBytes + 100 million -> 1 Tbyte + +Older COVID-19 thumbnails were about 400px wide: + + pdftocairo -png -singlefile -scale-to-x 400 -scale-to-y -1 + +Display on scholar-qa.archive.org is about 135x181px + +archive.org does 180px wide + +Unclear if we should try to do double resolution for high DPI screens (eg, +apple "retina"). + +Using same size as archive.org probably makes the most sense: max 180px wide, +preserve aspect ratio. And jpeg improvement seems worth it. + +#### Merlijn notes + +From work on optimizing microfilm thumbnail images: + + When possible, generate a thumbnail that fits well on the screen of the + user. Always creating a large thumbnail will result in the browsers + downscaling them, leading to fuzzy text. If it’s not possible, then create + the pick the resolution you’d want to support (1.5x or 2x scaling) and + create thumbnails of that size, but also apply the other recommendations + below - especially a sharpening filter. + + Use bicubic or lanczos interpolation. Bilinear and nearest neighbour are + not OK. 
+ + For text, consider applying a sharpening filter. Not a strong one, but some + sharpening can definitely help. + + +## Appendix: PDF Info Fields + +From `pdfinfo` manpage: + + The ´Info' dictionary contains the following values: + + title + subject + keywords + author + creator + producer + creation date + modification date + + In addition, the following information is printed: + + tagged (yes/no) + form (AcroForm / XFA / none) + javascript (yes/no) + page count + encrypted flag (yes/no) + print and copy permissions (if encrypted) + page size + file size + linearized (yes/no) + PDF version + metadata (only if requested) + +For an example file, the output looks like: + + Title: A mountable toilet system for personalized health monitoring via the analysis of excreta + Subject: Nature Biomedical Engineering, doi:10.1038/s41551-020-0534-9 + Keywords: + Author: Seung-min Park + Creator: Springer + CreationDate: Thu Mar 26 01:26:57 2020 PDT + ModDate: Thu Mar 26 01:28:06 2020 PDT + Tagged: no + UserProperties: no + Suspects: no + Form: AcroForm + JavaScript: no + Pages: 14 + Encrypted: no + Page size: 595.276 x 790.866 pts + Page rot: 0 + File size: 6104749 bytes + Optimized: yes + PDF version: 1.4 + +For context on the `pdf_id` fields ("original" and "updated"), read: +<https://web.hypothes.is/blog/synchronizing-annotations-between-local-and-remote-pdfs/> diff --git a/proposals/2020_seaweed_s3.md b/proposals/2020_seaweed_s3.md new file mode 100644 index 0000000..5f4ff0b --- /dev/null +++ b/proposals/2020_seaweed_s3.md @@ -0,0 +1,426 @@ +# Notes on seaweedfs + +> 2020-04-28, martin@archive.org + +Currently (04/2020) [minio](https://github.com/minio/minio) is used to store +output from PDF analysis for [fatcat](https://fatcat.wiki) (e.g. from +[grobid](https://grobid.readthedocs.io/en/latest/)). The file checksum (sha1) +serves as key, values are blobs of XML or JSON. + +Problem: minio inserts slowed down after inserting 80M or more objects. + +Summary: I did four test runs, three failed, one (testrun-4) succeeded. + +* [testrun-4](https://git.archive.org/webgroup/sandcrawler/-/blob/master/proposals/2020_seaweed_s3.md#testrun-4) + +So far, in a non-distributed mode, the project looks usable. Added 200M objects +(about 550G) in 6 days. Full CPU load, 400M RAM usage, constant insert times. + +---- + +Details (03/2020) / @bnewbold, slack + +> the sandcrawler XML data store (currently on aitio) is grinding to a halt, I +> think because despite tuning minio+ext4+hdd just doesn't work. current at 2.6 +> TiB of data (each document compressed with snappy) and 87,403,183 objects. + +> this doesn't impact ingest processing (because content is queued and archived +> in kafka), but does impact processing and analysis + +> it is possible that the other load on aitio is making this worse, but I did +> an experiment with dumping to a 16 TB disk that slowed way down after about +> 50 million files also. 
some people on the internet said to just not worry +> about these huge file counts on modern filesystems, but i've debugged a bit +> and I think it is a bad idea after all + +Possible solutions + +* putting content in fake WARCs and trying to do something like CDX +* deploy CEPH object store (or swift, or any other off-the-shelf object store) +* try putting the files in postgres tables, mongodb, cassandra, etc: these are + not designed for hundreds of millions of ~50 KByte XML documents (5 - 500 + KByte range) +* try to find or adapt an open source tool like Haystack, Facebook's solution + to this engineering problem. eg: + https://engineering.linkedin.com/blog/2016/05/introducing-and-open-sourcing-ambry---linkedins-new-distributed- + +---- + +The following are notes gathered during a few test runs of seaweedfs in 04/2020 +on wbgrp-svc170.us.archive.org (4 core E5-2620 v4, 4GB RAM). + +---- + +## Setup + +There are frequent [releases](https://github.com/chrislusf/seaweedfs/releases) +but for the test, we used a build off master branch. + +Directions for configuring AWS CLI for seaweedfs: +[https://github.com/chrislusf/seaweedfs/wiki/AWS-CLI-with-SeaweedFS](https://github.com/chrislusf/seaweedfs/wiki/AWS-CLI-with-SeaweedFS). + +### Build the binary + +Using development version (requires a [Go installation](https://golang.org/dl/)). + +``` +$ git clone git@github.com:chrislusf/seaweedfs.git # 11f5a6d9 +$ cd seaweedfs +$ make +$ ls -lah weed/weed +-rwxr-xr-x 1 tir tir 55M Apr 17 16:57 weed + +$ git rev-parse HEAD +11f5a6d91346e5f3cbf3b46e0a660e231c5c2998 + +$ sha1sum weed/weed +a7f8f0b49e6183da06fc2d1411c7a0714a2cc96b +``` + +A single, 55M binary emerges after a few seconds. The binary contains +subcommands to run different parts of seaweed, e.g. master or volume servers, +filer and commands for maintenance tasks, like backup and compaction. + +To *deploy*, just copy this binary to the destination. + +### Quickstart with S3 + +Assuming `weed` binary is in PATH. + +Start a master and volume server (over /tmp, most likely) and the S3 API with a single command: + +``` +$ weed -server s3 +... +Start Seaweed Master 30GB 1.74 at 0.0.0.0:9333 +... +Store started on dir: /tmp with 0 volumes max 7 +Store started on dir: /tmp with 0 ec shards +Volume server start with seed master nodes: [localhost:9333] +... +Start Seaweed S3 API Server 30GB 1.74 at http port 8333 +... +``` + +Install the [AWS +CLI](https://github.com/chrislusf/seaweedfs/wiki/AWS-CLI-with-SeaweedFS). +Create a bucket. + +``` +$ aws --endpoint-url http://localhost:8333 s3 mb s3://sandcrawler-dev +make_bucket: sandcrawler-dev +``` + +List buckets. + +``` +$ aws --endpoint-url http://localhost:8333 s3 ls +2020-04-17 17:44:39 sandcrawler-dev +``` + +Create a dummy file. + +``` +$ echo "blob" > 12340d9a4a4f710ecf03b127051814385e83ff08.tei.xml +``` + +Upload. + +``` +$ aws --endpoint-url http://localhost:8333 s3 cp 12340d9a4a4f710ecf03b127051814385e83ff08.tei.xml s3://sandcrawler-dev +upload: ./12340d9a4a4f710ecf03b127051814385e83ff08.tei.xml to s3://sandcrawler-dev/12340d9a4a4f710ecf03b127051814385e83ff08.tei.xml +``` + +List. + +``` +$ aws --endpoint-url http://localhost:8333 s3 ls s3://sandcrawler-dev +2020-04-17 17:50:35 5 12340d9a4a4f710ecf03b127051814385e83ff08.tei.xml +``` + +Stream to stdout. + +``` +$ aws --endpoint-url http://localhost:8333 s3 cp s3://sandcrawler-dev/12340d9a4a4f710ecf03b127051814385e83ff08.tei.xml - +blob +``` + +Drop the bucket. 
+ +``` +$ aws --endpoint-url http://localhost:8333 s3 rm --recursive s3://sandcrawler-dev +``` + +### Builtin benchmark + +The project comes with a builtin benchmark command. + +``` +$ weed benchmark +``` + +I encountered an error like +[#181](https://github.com/chrislusf/seaweedfs/issues/181), "no free volume +left" - when trying to start the benchmark after the S3 ops. A restart or a restart with `-volume.max 100` helped. + +``` +$ weed server -s3 -volume.max 100 +``` + +### Listing volumes + +``` +$ weed shell +> volume.list +Topology volume:15/112757 active:8 free:112742 remote:0 volumeSizeLimit:100 MB + DataCenter DefaultDataCenter volume:15/112757 active:8 free:112742 remote:0 + Rack DefaultRack volume:15/112757 active:8 free:112742 remote:0 + DataNode localhost:8080 volume:15/112757 active:8 free:112742 remote:0 + volume id:1 size:105328040 collection:"test" file_count:33933 version:3 modified_at_second:1587215730 + volume id:2 size:106268552 collection:"test" file_count:34236 version:3 modified_at_second:1587215730 + volume id:3 size:106290280 collection:"test" file_count:34243 version:3 modified_at_second:1587215730 + volume id:4 size:105815368 collection:"test" file_count:34090 version:3 modified_at_second:1587215730 + volume id:5 size:105660168 collection:"test" file_count:34040 version:3 modified_at_second:1587215730 + volume id:6 size:106296488 collection:"test" file_count:34245 version:3 modified_at_second:1587215730 + volume id:7 size:105753288 collection:"test" file_count:34070 version:3 modified_at_second:1587215730 + volume id:8 size:7746408 file_count:12 version:3 modified_at_second:1587215764 + volume id:9 size:10438760 collection:"test" file_count:3363 version:3 modified_at_second:1587215788 + volume id:10 size:10240104 collection:"test" file_count:3299 version:3 modified_at_second:1587215788 + volume id:11 size:10258728 collection:"test" file_count:3305 version:3 modified_at_second:1587215788 + volume id:12 size:10240104 collection:"test" file_count:3299 version:3 modified_at_second:1587215788 + volume id:13 size:10112840 collection:"test" file_count:3258 version:3 modified_at_second:1587215788 + volume id:14 size:10190440 collection:"test" file_count:3283 version:3 modified_at_second:1587215788 + volume id:15 size:10112840 collection:"test" file_count:3258 version:3 modified_at_second:1587215788 + DataNode localhost:8080 total size:820752408 file_count:261934 + Rack DefaultRack total size:820752408 file_count:261934 + DataCenter DefaultDataCenter total size:820752408 file_count:261934 +total size:820752408 file_count:261934 +``` + +### Custom S3 benchmark + +To simulate the use case of S3 for 100-500M small files (grobid xml, pdftotext, +...), I created a synthetic benchmark. + +* [https://gist.github.com/miku/6f3fee974ba82083325c2f24c912b47b](https://gist.github.com/miku/6f3fee974ba82083325c2f24c912b47b) + +We just try to fill up the datastore with millions of 5k blobs. + +---- + +### testrun-1 + +Small set, just to run. Status: done. Learned that the default in-memory volume +index grows too quickly for the 4GB RAM machine. + +``` +$ weed server -dir /tmp/martin-seaweedfs-testrun-1 -s3 -volume.max 512 -master.volumeSizeLimitMB 100 +``` + +* https://github.com/chrislusf/seaweedfs/issues/498 -- RAM +* at 10M files, we already consume ~1G + +``` +-volume.index string + Choose [memory|leveldb|leveldbMedium|leveldbLarge] mode for memory~performance balance. (default "memory") +``` + +### testrun-2 + +200M 5k objects, in-memory volume index. Status: done. 
Observed: After 18M +objects the 512 100MB volumes are exhausted and seaweedfs will not accept any +new data. + +``` +$ weed server -dir /tmp/martin-seaweedfs-testrun-2 -s3 -volume.max 512 -master.volumeSizeLimitMB 100 +... +I0418 12:01:43 1622 volume_loading.go:104] loading index /tmp/martin-seaweedfs-testrun-2/test_511.idx to memory +I0418 12:01:43 1622 store.go:122] add volume 511 +I0418 12:01:43 1622 volume_layout.go:243] Volume 511 becomes writable +I0418 12:01:43 1622 volume_growth.go:224] Created Volume 511 on topo:DefaultDataCenter:DefaultRack:localhost:8080 +I0418 12:01:43 1622 master_grpc_server.go:158] master send to master@[::1]:45084: url:"localhost:8080" public_url:"localhost:8080" new_vids:511 +I0418 12:01:43 1622 master_grpc_server.go:158] master send to filer@::1:18888: url:"localhost:8080" public_url:"localhost:8080" new_vids:511 +I0418 12:01:43 1622 store.go:118] In dir /tmp/martin-seaweedfs-testrun-2 adds volume:512 collection:test replicaPlacement:000 ttl: +I0418 12:01:43 1622 volume_loading.go:104] loading index /tmp/martin-seaweedfs-testrun-2/test_512.idx to memory +I0418 12:01:43 1622 store.go:122] add volume 512 +I0418 12:01:43 1622 volume_layout.go:243] Volume 512 becomes writable +I0418 12:01:43 1622 master_grpc_server.go:158] master send to master@[::1]:45084: url:"localhost:8080" public_url:"localhost:8080" new_vids:512 +I0418 12:01:43 1622 master_grpc_server.go:158] master send to filer@::1:18888: url:"localhost:8080" public_url:"localhost:8080" new_vids:512 +I0418 12:01:43 1622 volume_growth.go:224] Created Volume 512 on topo:DefaultDataCenter:DefaultRack:localhost:8080 +I0418 12:01:43 1622 node.go:82] topo failed to pick 1 from 0 node candidates +I0418 12:01:43 1622 volume_growth.go:88] create 7 volume, created 2: No enough data node found! +I0418 12:04:30 1622 volume_layout.go:231] Volume 511 becomes unwritable +I0418 12:04:30 1622 volume_layout.go:231] Volume 512 becomes unwritable +E0418 12:04:30 1622 filer_server_handlers_write.go:69] failing to assign a file id: rpc error: code = Unknown desc = No free volumes left! +I0418 12:04:30 1622 filer_server_handlers_write.go:120] fail to allocate volume for /buckets/test/k43731970, collection:test, datacenter: +E0418 12:04:30 1622 filer_server_handlers_write.go:69] failing to assign a file id: rpc error: code = Unknown desc = No free volumes left! +E0418 12:04:30 1622 filer_server_handlers_write.go:69] failing to assign a file id: rpc error: code = Unknown desc = No free volumes left! +E0418 12:04:30 1622 filer_server_handlers_write.go:69] failing to assign a file id: rpc error: code = Unknown desc = No free volumes left! +E0418 12:04:30 1622 filer_server_handlers_write.go:69] failing to assign a file id: rpc error: code = Unknown desc = No free volumes left! 
+
I0418 12:04:30 1622 masterclient.go:88] filer failed to receive from localhost:9333: rpc error: code = Unavailable desc = transport is closing
I0418 12:04:30 1622 master_grpc_server.go:276] - client filer@::1:18888
```

Inserted about 18M docs, then:

```
worker-0 @3720000 45475.13 81.80
worker-1 @3730000 45525.00 81.93
worker-3 @3720000 45525.76 81.71
worker-4 @3720000 45527.22 81.71
Process Process-1:
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "s3test.py", line 42, in insert_keys
    s3.Bucket(bucket).put_object(Key=key, Body=data)
  File "/home/martin/.virtualenvs/6f3fee974ba82083325c2f24c912b47b/lib/python3.5/site-packages/boto3/resources/factory.py", line 520, in do_action
    response = action(self, *args, **kwargs)
  File "/home/martin/.virtualenvs/6f3fee974ba82083325c2f24c912b47b/lib/python3.5/site-packages/boto3/resources/action.py", line 83, in __call__
    response = getattr(parent.meta.client, operation_name)(**params)
  File "/home/martin/.virtualenvs/6f3fee974ba82083325c2f24c912b47b/lib/python3.5/site-packages/botocore/client.py", line 316, in _api_call
    return self._make_api_call(operation_name, kwargs)
  File "/home/martin/.virtualenvs/6f3fee974ba82083325c2f24c912b47b/lib/python3.5/site-packages/botocore/client.py", line 626, in _make_api_call
    raise error_class(parsed_response, operation_name)
botocore.exceptions.ClientError: An error occurred (InternalError) when calling the PutObject operation (reached max retries: 4): We encountered an internal error, please try again.

real    759m30.034s
user    1962m47.487s
sys     105m21.113s
```

Sustained 400 S3 puts/s, RAM usage 41% of a 4G machine. 56G on disk.

> No free volumes left! Failed to allocate bucket for /buckets/test/k163721819

### testrun-3

* use leveldb, leveldbLarge
* try "auto" volumes
* Status: done. Observed: rapid memory usage increase.

```
$ weed server -dir /tmp/martin-seaweedfs-testrun-3 -s3 -volume.max 0 -volume.index=leveldbLarge -filer=false -master.volumeSizeLimitMB 100
```

Observations: memory usage grows rapidly, soon reaching 15%.

Note-to-self: [https://github.com/chrislusf/seaweedfs/wiki/Optimization](https://github.com/chrislusf/seaweedfs/wiki/Optimization)

### testrun-4

The default volume size is 30G (and cannot currently be larger), and RAM usage
grows quickly with the number of volumes. Therefore, keep the default volume
size, do not limit the number of volumes (`-volume.max 0`), and do not use the
in-memory index (use leveldb instead).

Status: done, 200M objects uploaded successfully via the Python script in about
6 days, memory usage was at a moderate 400M (~10% of RAM). Relatively constant
performance at about 400 `PutObject` requests/s (over 5 threads, each thread
was around 80 requests/s; then testing with 4 threads, each thread got to
around 100 requests/s).

```
$ weed server -dir /tmp/martin-seaweedfs-testrun-4 -s3 -volume.max 0 -volume.index=leveldb
```

The test script command was as follows (40M files per worker, 5 workers):

```
$ time python s3test.py -n 40000000 -w 5 2> s3test.4.log
...

real    8454m33.695s
user    21318m23.094s
sys     1128m32.293s
```

The test script adds keys from `k0...k199999999`.
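The linked gist is the authoritative version of the benchmark; its core is just
boto3 `put_object` calls spread over worker processes (this matches the
`insert_keys` frame visible in the traceback above). A minimal sketch, where the
key striping, endpoint, and dummy credentials are assumptions rather than the
exact gist code:

```
import multiprocessing
import os

import boto3

ENDPOINT = "http://localhost:8333"  # assumed local seaweedfs S3 gateway
BUCKET = "test"
BLOB = os.urandom(5000)             # ~5 KByte dummy payload, as in the benchmark

def insert_keys(worker_id, total, num_workers):
    # Each worker writes a disjoint stripe of keys: k<worker_id>, k<worker_id + num_workers>, ...
    s3 = boto3.resource(
        "s3",
        endpoint_url=ENDPOINT,
        aws_access_key_id="any",      # seaweedfs does not check credentials unless configured
        aws_secret_access_key="any",
        region_name="us-east-1",
    )
    bucket = s3.Bucket(BUCKET)
    for i in range(worker_id, total, num_workers):
        bucket.put_object(Key="k{}".format(i), Body=BLOB)

if __name__ == "__main__":
    num_workers = 5
    total_keys = 200000000
    workers = [
        multiprocessing.Process(target=insert_keys, args=(w, total_keys, num_workers))
        for w in range(num_workers)
    ]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
```

The keys written this way show up directly in S3 listings, as below.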
+ +``` +$ aws --endpoint-url http://localhost:8333 s3 ls s3://test | head -20 +2020-04-19 09:27:13 5000 k0 +2020-04-19 09:27:13 5000 k1 +2020-04-19 09:27:13 5000 k10 +2020-04-19 09:27:15 5000 k100 +2020-04-19 09:27:26 5000 k1000 +2020-04-19 09:29:15 5000 k10000 +2020-04-19 09:47:49 5000 k100000 +2020-04-19 12:54:03 5000 k1000000 +2020-04-20 20:14:10 5000 k10000000 +2020-04-22 07:33:46 5000 k100000000 +2020-04-22 07:33:46 5000 k100000001 +2020-04-22 07:33:46 5000 k100000002 +2020-04-22 07:33:46 5000 k100000003 +2020-04-22 07:33:46 5000 k100000004 +2020-04-22 07:33:46 5000 k100000005 +2020-04-22 07:33:46 5000 k100000006 +2020-04-22 07:33:46 5000 k100000007 +2020-04-22 07:33:46 5000 k100000008 +2020-04-22 07:33:46 5000 k100000009 +2020-04-20 20:14:10 5000 k10000001 +``` + +Glance at stats. + +``` +$ du -hs /tmp/martin-seaweedfs-testrun-4 +596G /tmp/martin-seaweedfs-testrun-4 + +$ find . /tmp/martin-seaweedfs-testrun-4 | wc -l +5104 + +$ ps --pid $(pidof weed) -o pid,tid,class,stat,vsz,rss,comm + PID TID CLS STAT VSZ RSS COMMAND +32194 32194 TS Sl+ 1966964 491644 weed + +$ ls -1 /proc/$(pidof weed)/fd | wc -l +192 + +$ free -m + total used free shared buff/cache available +Mem: 3944 534 324 39 3086 3423 +Swap: 4094 27 4067 +``` + +### Note on restart + +When stopping (CTRL-C) and restarting `weed` it will take about 10 seconds to +get the S3 API server back up, but another minute or two, until seaweedfs +inspects all existing volumes and indices. + +In that gap, requests to S3 will look like internal server errors. + +``` +$ aws --endpoint-url http://localhost:8333 s3 cp s3://test/k100 - +download failed: s3://test/k100 to - An error occurred (500) when calling the +GetObject operation (reached max retries: 4): Internal Server Error +``` + +### Read benchmark + +Reading via command line `aws` client is a bit slow at first sight (3-5s). + +``` +$ time aws --endpoint-url http://localhost:8333 s3 cp s3://test/k123456789 - +ppbhjgzkrrgwagmjsuwhqcwqzmefybeopqz [...] + +real 0m5.839s +user 0m0.898s +sys 0m0.293s +``` + +#### Single process random reads + +* via [s3read.go](https://gist.github.com/miku/6f3fee974ba82083325c2f24c912b47b#file-s3read-go) + +Running 1000 random reads takes 49s. + +#### Concurrent random reads + +* 80000 request with 8 parallel processes: 7m41.973968488s, so about 170 objects/s) +* seen up to 760 keys/s reads for 8 workers +* weed will utilize all cores, so more cpus could result in higher read throughput +* RAM usage can increase (seen up to 20% of 4G RAM), then descrease (GC) back to 5%, depending on query load diff --git a/proposals/2021-04-22_crossref_db.md b/proposals/2021-04-22_crossref_db.md new file mode 100644 index 0000000..bead7a4 --- /dev/null +++ b/proposals/2021-04-22_crossref_db.md @@ -0,0 +1,86 @@ + +status: work-in-progress + +Crossref DOI Metadata in Sandcrawler DB +======================================= + +Proposal is to have a local copy of Crossref API metadata records in +sandcrawler DB, accessible by simple key lookup via postgrest. + +Initial goal is to include these in scholar work "bundles" (along with +fulltext, etc), in particular as part of reference extraction pipeline. Around +late 2020, many additional references became available via Crossref records, +and have not been imported (updated) into fatcat. Reference storage in fatcat +API is a scaling problem we would like to put off, so injecting content in this +way is desirable. + +To start, working with a bulk dump made available by Crossref. 
In the future, we might persist the daily feed so that we have a continuously
up-to-date copy.

Another application of Crossref-in-bundles is to identify overall scale of
changes since initial Crossref metadata import.


## Sandcrawler DB Schema

The "updated" field in this case refers to the upstream timestamp, not the
sandcrawler database update time.

    CREATE TABLE IF NOT EXISTS crossref (
        doi TEXT NOT NULL CHECK (octet_length(doi) >= 4 AND doi = LOWER(doi)),
        indexed TIMESTAMP WITH TIME ZONE NOT NULL,
        record JSON NOT NULL,
        PRIMARY KEY(doi)
    );

For postgrest access, may need to also:

    GRANT SELECT ON public.crossref TO web_anon;

## SQL Backfill Command

For an example file:

    cat sample.json \
        | jq -rc '[(.DOI | ascii_downcase), .indexed."date-time", (. | tostring)] | @tsv' \
        | psql sandcrawler -c "COPY crossref (doi, indexed, record) FROM STDIN (DELIMITER E'\t');"

For a full snapshot:

    zcat crossref_public_data_file_2021_01.json.gz \
        | pv -l \
        | jq -rc '[(.DOI | ascii_downcase), .indexed."date-time", (. | tostring)] | @tsv' \
        | psql sandcrawler -c "COPY crossref (doi, indexed, record) FROM STDIN (DELIMITER E'\t');"

jq is the bottleneck (100% of a single CPU core).

## Kafka Worker

Pulls from the fatcat crossref ingest Kafka feed and persists into the crossref
table.

## SQL Table Disk Utilization

An example backfill from early 2021, with about 120 million Crossref DOI
records.

Starting database size (with ingest running):

    Filesystem      Size  Used Avail Use% Mounted on
    /dev/vdb1       1.7T  896G  818G  53% /1

    Size: 475.14G

Ingest SQL command took:

    120M 15:06:08 [2.22k/s]
    COPY 120684688

After database size:

    Filesystem      Size  Used Avail Use% Mounted on
    /dev/vdb1       1.7T  1.2T  498G  71% /1

    Size: 794.88G

So about 320 GByte of disk.
diff --git a/proposals/2021-09-09_component_ingest.md b/proposals/2021-09-09_component_ingest.md
new file mode 100644
index 0000000..09dee4f
--- /dev/null
+++ b/proposals/2021-09-09_component_ingest.md
@@ -0,0 +1,114 @@

File Ingest Mode: 'component'
=============================

A new ingest type for downloading individual files which are a subset of a
complete work.

Some publishers now assign DOIs to individual figures, supplements, and other
"components" of an overall release or document.

Initial mimetypes to allow:

- image/jpeg
- image/tiff
- image/png
- image/gif
- audio/mpeg
- video/mp4
- video/mpeg
- text/plain
- text/csv
- application/json
- application/xml
- application/pdf
- application/gzip
- application/x-bzip
- application/x-bzip2
- application/zip
- application/x-rar
- application/x-7z-compressed
- application/x-tar
- application/vnd.ms-powerpoint
- application/vnd.ms-excel
- application/msword
- application/vnd.openxmlformats-officedocument.wordprocessingml.document
- application/vnd.openxmlformats-officedocument.spreadsheetml.sheet

Intentionally not supporting:

- text/html


## Fatcat Changes

In the file importer, allow the additional mimetypes for 'component' ingest.


## Ingest Changes

Allow additional terminal mimetypes for 'component' crawls.


## Examples

Hundreds of thousands: <https://fatcat.wiki/release/search?q=type%3Acomponent+in_ia%3Afalse>

#### ACS Supplement File

<https://doi.org/10.1021/acscatal.0c02627.s002>

Redirects directly to .zip in browser. SPN is blocked by cookie check.
+ +#### Frontiers .docx Supplement + +<https://doi.org/10.3389/fpls.2019.01642.s001> + +Redirects to full article page. There is a pop-up for figshare, seems hard to process. + +#### Figshare Single FIle + +<https://doi.org/10.6084/m9.figshare.13646972.v1> + +As 'component' type in fatcat. + +Redirects to a landing page. Dataset ingest seems more appropriate for this entire domain. + +#### PeerJ supplement file + +<https://doi.org/10.7717/peerj.10257/supp-7> + +PeerJ is hard because it redirects to a single HTML page, which has links to +supplements in the HTML. Perhaps a custom extractor will work. + +#### eLife + +<https://doi.org/10.7554/elife.38407.010> + +The current crawl mechanism makes it seemingly impossible to extract a specific +supplement from the document as a whole. + +#### Zookeys + +<https://doi.org/10.3897/zookeys.895.38576.figure53> + +These are extract-able. + +#### OECD PDF Supplement + +<https://doi.org/10.1787/f08c6324-en> +<https://www.oecd-ilibrary.org/trade/imports-of-services-billions-of-us-dollars_f08c6324-en> + +Has an Excel (.xls) link, great, but then paywall. + +#### Direct File Link + +<https://doi.org/10.1787/888934207500> + +This one is also OECD, but is a simple direct download. + +#### Protein Data Base (PDB) Entry + +<https://doi.org/10.2210/pdb6ls2/pdb> + +Multiple files; dataset/fileset more appropriate for these. diff --git a/proposals/2021-09-13_src_ingest.md b/proposals/2021-09-13_src_ingest.md new file mode 100644 index 0000000..470827a --- /dev/null +++ b/proposals/2021-09-13_src_ingest.md @@ -0,0 +1,53 @@ + +File Ingest Mode: 'src' +======================= + +Ingest type for "source" of works in document form. For example, tarballs of +LaTeX source and figures, as published on arxiv.org and Pubmed Central. + +For now, presumption is that this would be a single file (`file` entity in +fatcat). + +Initial mimetypes to allow: + +- text/x-tex +- application/xml +- application/gzip +- application/x-bzip +- application/x-bzip2 +- application/zip +- application/x-tar +- application/msword +- application/vnd.openxmlformats-officedocument.wordprocessingml.document + + +## Fatcat Changes + +In the file importer, allow the additional mimetypes for 'src' ingest. + +Might keep ingest disabled on the fatcat side, at least initially. Eg, until +there is some scope of "file scope", or other ways of treating 'src' tarballs +separate from PDFs or other fulltext formats. + + +## Ingest Changes + +Allow additional terminal mimetypes for 'src' crawls. + + +## Examples + + arxiv:2109.00954v1 + fatcat:release_akzp2lgqjbcbhpoeoitsj5k5hy + https://arxiv.org/format/2109.00954v1 + https://arxiv.org/e-print/2109.00954v1 + + arxiv:1912.03397v2 + https://arxiv.org/format/1912.03397v2 + https://arxiv.org/e-print/1912.03397v2 + NOT: https://arxiv.org/pdf/1912.03397v2 + + pmcid:PMC3767916 + https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/08/03/PMC3767916.tar.gz + +For PMC, will need to use one of the .csv file lists to get the digit prefixes. 
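A minimal sketch of that PMC lookup, assuming a downloaded copy of the OA file
list CSV in which the package path (e.g. `oa_package/08/03/PMC3767916.tar.gz`)
is the first column; the function name and column handling are illustrative,
not the actual importer code:

```
import csv

FTP_BASE = "https://ftp.ncbi.nlm.nih.gov/pub/pmc/"

def pmc_package_url(file_list_path, pmcid):
    # Scan the OA file list for the row mentioning this PMCID and return the
    # full package URL; the path already encodes the digit-pair prefixes.
    with open(file_list_path, newline="") as f:
        for row in csv.reader(f):
            if pmcid in row[1:]:
                return FTP_BASE + row[0]
    return None

# e.g. pmc_package_url("oa_file_list.csv", "PMC3767916")
# -> "https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/08/03/PMC3767916.tar.gz"
```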
diff --git a/proposals/schema_changes.sql b/proposals/schema_changes.sql new file mode 100644 index 0000000..e18d051 --- /dev/null +++ b/proposals/schema_changes.sql @@ -0,0 +1,40 @@ + +-- file_meta: more NOT NULL +CREATE TABLE IF NOT EXISTS file_meta ( + sha1hex TEXT NOT NULL PRIMARY KEY CHECK (octet_length(sha1hex) = 40), + sha256hex TEXT NOT NULL CHECK (octet_length(sha256hex) = 64), + md5hex TEXT NOT NULL CHECK (octet_length(md5hex) = 32), + size_bytes BIGINT NOT NULL, + mimetype TEXT CHECK (octet_length(mimetype) >= 1) +); + +-- CDX: add domain/host columns? +CREATE TABLE IF NOT EXISTS cdx ( + url TEXT NOT NULL CHECK (octet_length(url) >= 1), + datetime TEXT NOT NULL CHECK (octet_length(datetime) = 14), + sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40), + cdx_sha1hex TEXT CHECK (octet_length(cdx_sha1hex) = 40), + mimetype TEXT CHECK (octet_length(mimetype) >= 1), + warc_path TEXT CHECK (octet_length(warc_path) >= 1), + warc_csize BIGINT, + warc_offset BIGINT, + row_created TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, + domain TEXT NOT NULL CHECK (octet_length(domain) >= 1), + host TEXT NOT NULL CHECK (octet_length(host) >= 1), + PRIMARY KEY(url, datetime) +); +CREATE INDEX IF NOT EXISTS cdx_sha1hex_idx ON cdx(sha1hex); +CREATE INDEX IF NOT EXISTS cdx_row_created_idx ON cdx(row_created); + +-- direct fast import with just md5hex; big UPDATE via join with file_meta +CREATE TABLE IF NOT EXISTS shadow ( + shadow_corpus TEXT NOT NULL CHECK (octet_length(shadow_corpus) >= 1), + shadow_id TEXT NOT NULL CHECK (octet_length(shadow_id) >= 1), + sha1hex TEXT CHECK (octet_length(sha1hex) = 40), + md5hex TEXT CHECK (octet_length(md5hex) = 32), + doi TEXT CHECK (octet_length(doi) >= 1), + pmid TEXT CHECK (octet_length(pmid) >= 1), + isbn13 TEXT CHECK (octet_length(isbn13) >= 1), + PRIMARY KEY(shadow_corpus, shadow_id) +); +CREATE INDEX shadow_sha1hex_idx ON shadow(sha1hex); |
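One way to read the "direct fast import with just md5hex; big UPDATE via join
with file_meta" comment above is a backfill pass along these lines; this is a
sketch under that reading, and the connection string and function name are
assumptions, not the deployed tooling:

```
import psycopg2

BACKFILL_SQL = """
    UPDATE shadow
    SET sha1hex = file_meta.sha1hex
    FROM file_meta
    WHERE shadow.sha1hex IS NULL
      AND shadow.md5hex = file_meta.md5hex
"""

def backfill_shadow_sha1(dsn="dbname=sandcrawler"):
    # Fill in sha1hex for shadow rows that were imported with only an md5,
    # by joining against file_meta on md5hex.
    with psycopg2.connect(dsn) as conn, conn.cursor() as cur:
        cur.execute(BACKFILL_SQL)
        return cur.rowcount  # number of shadow rows updated
```

Since `file_meta.md5hex` is not indexed in the schema above, this would run as
a single large join rather than per-row lookups, which seems to be the intent
of the "big UPDATE" note.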