From f53ada2addef33a0096af079281ad81143339136 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 1 Jul 2020 16:34:37 -0700 Subject: JALC bulk edit notes from 2020-03-23 --- notes/bulk_edits/2020-03-23_jalc.md | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 notes/bulk_edits/2020-03-23_jalc.md (limited to 'notes') diff --git a/notes/bulk_edits/2020-03-23_jalc.md b/notes/bulk_edits/2020-03-23_jalc.md new file mode 100644 index 00000000..d63c3759 --- /dev/null +++ b/notes/bulk_edits/2020-03-23_jalc.md @@ -0,0 +1,23 @@ + +2019-10-01 JaLC metadata snapshot: + +Extracted the .rdf file instead of piping it through zcat. + +Use the correct bot: + + export FATCAT_AUTH_WORKER_JALC=blah + +Start small; do a random bunch (10k) single-threaded to pre-create containers: + + head -n100 /srv/fatcat/datasets/JALC-LOD-20191001.rdf | ./fatcat_import.py --batch-size 100 jalc - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt + shuf -n100 /srv/fatcat/datasets/JALC-LOD-20191001.rdf | ./fatcat_import.py --batch-size 100 jalc - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt + shuf -n10000 /srv/fatcat/datasets/JALC-LOD-20191001.rdf | ./fatcat_import.py --batch-size 100 jalc - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt + +It seemed like lots of individual containers were getting added after repeating these runs, so +I am just going to import single-threaded to avoid duplicate container creation: + + cat /srv/fatcat/datasets/JALC-LOD-20191001.rdf | ./fatcat_import.py --batch-size 100 jalc - /srv/fatcat/datasets/ISSN-to-ISSN-L.txt + => Counter({'total': 8419745, 'exists': 6480683, 'insert': 1934082, 'skip': 4980, 'inserted.container': 134, 'update': 0}) + +There were a bit fewer than 4,568,120 "doi_registrar:jalc" releases before this +import, and 6,502,202 after (based on a `doi_registrar:jalc` query). -- cgit v1.2.3