From 87a9bb4ed81b503f03e6e77d6b082249523e67d4 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 2 Jun 2021 00:26:51 -0700
Subject: crossref DB proposal, and include in SQL schema

---
 proposals/2021-04-22_crossref_db.md | 86 +++++++++++++++++++++++++++++++++++++
 1 file changed, 86 insertions(+)
 create mode 100644 proposals/2021-04-22_crossref_db.md

diff --git a/proposals/2021-04-22_crossref_db.md b/proposals/2021-04-22_crossref_db.md
new file mode 100644
index 0000000..bead7a4
--- /dev/null
+++ b/proposals/2021-04-22_crossref_db.md
@@ -0,0 +1,86 @@

status: work-in-progress

Crossref DOI Metadata in Sandcrawler DB
=======================================

The proposal is to keep a local copy of Crossref API metadata records in the
sandcrawler DB, accessible by simple key lookup via postgrest.

The initial goal is to include these records in scholar work "bundles" (along
with fulltext, etc.), in particular as part of the reference extraction
pipeline. Around late 2020, many additional references became available via
Crossref records, but these have not been imported (or updated) into fatcat.
Reference storage in the fatcat API is a scaling problem we would like to put
off, so injecting content this way is desirable.

To start, we will work with a bulk dump made available by Crossref. In the
future, we might persist the daily feed so that we have a continuously
up-to-date copy.

Another application of Crossref-in-bundles is to identify the overall scale of
changes since the initial Crossref metadata import.


## Sandcrawler DB Schema

The `indexed` field in this case refers to the upstream (Crossref) timestamp,
not the sandcrawler database update time.

    CREATE TABLE IF NOT EXISTS crossref (
        doi         TEXT NOT NULL CHECK (octet_length(doi) >= 4 AND doi = LOWER(doi)),
        indexed     TIMESTAMP WITH TIME ZONE NOT NULL,
        record      JSON NOT NULL,
        PRIMARY KEY(doi)
    );

For postgrest access, we may also need to:

    GRANT SELECT ON public.crossref TO web_anon;

## SQL Backfill Command

For an example file:

    cat sample.json \
        | jq -rc '[(.DOI | ascii_downcase), .indexed."date-time", (. | tostring)] | @tsv' \
        | psql sandcrawler -c "COPY crossref (doi, indexed, record) FROM STDIN (DELIMITER E'\t');"

For a full snapshot:

    zcat crossref_public_data_file_2021_01.json.gz \
        | pv -l \
        | jq -rc '[(.DOI | ascii_downcase), .indexed."date-time", (. | tostring)] | @tsv' \
        | psql sandcrawler -c "COPY crossref (doi, indexed, record) FROM STDIN (DELIMITER E'\t');"

jq is the bottleneck in this pipeline (it pins a single CPU core at 100%).

## Kafka Worker

A worker pulls from the fatcat Crossref ingest Kafka feed and persists records
into the `crossref` table.

## SQL Table Disk Utilization

An example backfill from early 2021, with about 120 million Crossref DOI
records.

Starting database size (with ingest running):

    Filesystem      Size  Used Avail Use% Mounted on
    /dev/vdb1       1.7T  896G  818G  53% /1

    Size: 475.14G

The backfill COPY command took:

    120M 15:06:08 [2.22k/s]
    COPY 120684688

Database size after the backfill:

    Filesystem      Size  Used Avail Use% Mounted on
    /dev/vdb1       1.7T  1.2T  498G  71% /1

    Size: 794.88G

So the crossref table takes about 320 GBytes of disk.
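
As a usage sketch for the postgrest access described above: once the `web_anon`
grant is in place, a record could be fetched by DOI key lookup roughly as
follows. This is a sketch only; the hostname and port are illustrative
placeholders, not the actual deployment.

    import requests

    # Hostname and port are illustrative placeholders for the postgrest endpoint.
    resp = requests.get(
        "http://sandcrawler-db.example.internal:3030/crossref",
        params={"doi": "eq.10.1234/example"},
    )
    resp.raise_for_status()
    # postgrest returns a JSON array of matching rows (zero or one here, since
    # doi is the primary key).
    rows = resp.json()
    print(rows[0]["record"] if rows else None)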
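
And a minimal sketch of the Kafka worker described above, assuming a
`confluent_kafka` consumer and a `psycopg2` connection. The topic name,
consumer group, and connection details are illustrative placeholders, not the
actual sandcrawler implementation; the key point is the lowercase-DOI upsert
into the `crossref` table.

    import json

    import psycopg2
    from confluent_kafka import Consumer

    # Broker, group, and topic names are illustrative placeholders.
    consumer = Consumer({
        "bootstrap.servers": "localhost:9092",
        "group.id": "crossref-db-persist",
        "auto.offset.reset": "earliest",
    })
    consumer.subscribe(["fatcat-prod.api-crossref"])

    db = psycopg2.connect("dbname=sandcrawler")

    UPSERT_SQL = """
        INSERT INTO crossref (doi, indexed, record)
        VALUES (%s, %s, %s)
        ON CONFLICT (doi) DO UPDATE
        SET indexed = EXCLUDED.indexed, record = EXCLUDED.record
    """

    while True:
        msg = consumer.poll(timeout=5.0)
        if msg is None or msg.error():
            continue
        obj = json.loads(msg.value())
        with db.cursor() as cur:
            # Same normalization as the bulk backfill: lowercase DOI, upstream
            # "indexed" timestamp, full record serialized as JSON.
            cur.execute(UPSERT_SQL, (
                obj["DOI"].lower(),
                obj["indexed"]["date-time"],
                json.dumps(obj),
            ))
        db.commit()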