From d33a37eab50e95ceabadf7bbc20088ad62669564 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 23 Dec 2020 11:07:41 -0800 Subject: DOAJ import notes, and SQL/stats update --- notes/bulk_edits/2020-12-14_doaj.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'notes/bulk_edits') diff --git a/notes/bulk_edits/2020-12-14_doaj.md b/notes/bulk_edits/2020-12-14_doaj.md index 64a80fda..5e897183 100644 --- a/notes/bulk_edits/2020-12-14_doaj.md +++ b/notes/bulk_edits/2020-12-14_doaj.md @@ -122,3 +122,18 @@ ahead with the full import; note that other ingest is happening in parallel zcat /srv/fatcat/datasets/doaj_article_data_2020-11-13_all.json.gz | shuf | pv -l | parallel -j12 --round-robin --pipe ./fatcat_import.py doaj-article --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt - # started 2020-12-17 22:01 (Pacific) + + => 5.45M 52:38:45 [28.8 /s] + => Counter({'total': 1366458, 'exists': 1020295, 'insert': 200249, 'exists-fuzzy': 144334, 'skip': 1563, 'skip-title': 1563, 'skip-doaj-id-mismatch': 17, 'update': 0}) + +Estimated totals (the counter above scaled by 4x to cover the full import): + +- total: 5,465,832 +- exists: 4,081,180 +- exists-fuzzy: 577,336 +- insert: 800,996 + +Ending database size: 684.08G + +(note that regular imports were running during the same period) + -- cgit v1.2.3