From 51d754c3fd0cbabb2e81195e2cf70384ed36dad8 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 22 Mar 2022 13:18:21 -0700 Subject: document recent bulk metadata edits/imports --- extra/bulk_edits/2022-03-08_doaj.md | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 extra/bulk_edits/2022-03-08_doaj.md (limited to 'extra/bulk_edits/2022-03-08_doaj.md') diff --git a/extra/bulk_edits/2022-03-08_doaj.md b/extra/bulk_edits/2022-03-08_doaj.md new file mode 100644 index 00000000..fc6438d5 --- /dev/null +++ b/extra/bulk_edits/2022-03-08_doaj.md @@ -0,0 +1,23 @@ + +Simple periodic update of DOAJ article-level metadata. + + cat doaj_article_data_*/article_batch*.json | jq .[] -c | pv -l | gzip > doaj_article_data_2021-05-25_all.json.gz + => 6.1M 0:18:45 [5.42k/s] + => 7.26M 0:30:45 [3.94k/s] + + export FATCAT_AUTH_WORKER_DOAJ=... + cat /srv/fatcat/tasks/doaj_article_data_2022-03-07_sample_10k.json | ./fatcat_import.py doaj-article --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt - + # Counter({'total': 10000, 'exists': 8827, 'exists-fuzzy': 944, 'insert': 219, 'skip': 8, 'skip-title': 8, 'skip-doaj-id-mismatch': 2, 'update': 0}) + + zcat /srv/fatcat/tasks/doaj_article_data_2022-03-07_all.json.gz | shuf | pv -l | parallel -j12 --round-robin --pipe ./fatcat_import.py doaj-article --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt - + +The above seemed to use too much CPU, and caused a brief outage. Very high CPU +use for just the python import processes, for whatever reason. Turned down +parallelism and trying again: + + zcat /srv/fatcat/tasks/doaj_article_data_2022-03-07_all.json.gz | pv -l | parallel -j6 --round-robin --pipe ./fatcat_import.py doaj-article --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt - + # multiple counts of: + # Counter({'total': 1196313, 'exists': 1055412, 'exists-fuzzy': 111490, 'insert': 27835, 'skip': 1280, 'skip-title': 1280, 'skip-doaj-id-mismatch': 296, 'update': 0}) + # estimated only 167,010 new entities + +Then did a follow-up sandcrawler ingest, see notes in that repository. -- cgit v1.2.3