From 186c03c2b9e1b08e4298383fc6593d1b2c3a5dd8 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 25 Sep 2018 15:44:39 -0700 Subject: start organizing notes into subdirectories --- notes/bootstrap/old_imports.txt | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 notes/bootstrap/old_imports.txt (limited to 'notes/bootstrap/old_imports.txt') diff --git a/notes/bootstrap/old_imports.txt b/notes/bootstrap/old_imports.txt new file mode 100644 index 00000000..1233d4a8 --- /dev/null +++ b/notes/bootstrap/old_imports.txt @@ -0,0 +1,20 @@ + +## ORCID + +Directly from compressed tarball; takes about 2 hours in production: + + tar xf /srv/datasets/public_profiles_API-2.0_2017_10_json.tar.gz -O | jq -c . | grep '"person":' | time parallel -j12 --pipe --round-robin ./fatcat_import.py import-orcid - + +After tuning database, `jq` CPU seems to be bottleneck, so, from pre-extracted +tarball: + + tar xf /srv/datasets/public_profiles_API-2.0_2017_10_json.tar.gz -O | jq -c . | rg '"person":' > /srv/datasets/public_profiles_1_2_json.all.json + time parallel --bar --pipepart -j8 -a /srv/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py import-orcid - + +Does not work: + + ./fatcat_import.py import-orcid /data/orcid/partial/public_profiles_API-2.0_2017_10_json/3/0000-0001-5115-8623.json + +Instead: + + cat /data/orcid/partial/public_profiles_API-2.0_2017_10_json/3/0000-0001-5115-8623.json | jq -c . | ./fatcat_import.py import-orcid - -- cgit v1.2.3