summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2018-06-09 00:59:33 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2018-06-09 00:59:33 -0700
commit: 0f6354ffbdf7115f8a6d7e4d3ea700a44fe567ed (patch)
tree: d6fb3d094dd64f51b0d4723a0c30112b89b7c3d7
parent: ce0c06ca8e694362a3bf4cde175efbe1af6e4962 (diff)
download: fatcat-0f6354ffbdf7115f8a6d7e4d3ea700a44fe567ed.tar.gz
download: fatcat-0f6354ffbdf7115f8a6d7e4d3ea700a44fe567ed.zip
fixes to orcid importer for larger batches
-rw-r--r-- python/README_import.md | 31
-rw-r--r-- python/fatcat/orcid_importer.py | 8
2 files changed, 37 insertions, 2 deletions
diff --git a/python/README_import.md b/python/README_import.md
new file mode 100644
index 00000000..11cb0fd8
--- /dev/null
+++ b/python/README_import.md
@@ -0,0 +1,31 @@
+
+## ORCID
+
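+Does not work (the importer parses one JSON object per input line; these per-profile files are presumably pretty-printed across multiple lines):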
+
+ ./client.py import-orcid /data/orcid/partial/public_profiles_API-2.0_2017_10_json/3/0000-0001-5115-8623.json
+
+Instead:
+
+ cat /data/orcid/partial/public_profiles_API-2.0_2017_10_json/3/0000-0001-5115-8623.json | jq -c . | ./client.py import-orcid -
+
+Or for many files:
+
+ find /data/orcid/partial/public_profiles_API-2.0_2017_10_json/3 -iname '*.json' | parallel --bar jq -c . {} | rg '"person":' | ./client.py import-orcid -
+
+
+for ~9k files:
+
+ (python-B2RYrks8) bnewbold@orithena$ time parallel --pipepart -j4 -a /data/orcid/partial/public_profiles_API-2.0_2017_10_json/all.json ./client.py import-orcid -
+ real 0m15.294s
+ user 0m28.112s
+ sys 0m2.408s
+
+ => 636/second
+
+ (python-B2RYrks8) bnewbold@orithena$ time ./client.py import-orcid /data/orcid/partial/public_profiles_API-2.0_2017_10_json/all.json
+ real 0m47.268s
+ user 0m2.616s
+ sys 0m0.104s
+
+ => 203/second
diff --git a/python/fatcat/orcid_importer.py b/python/fatcat/orcid_importer.py
index 063390b8..02681b0a 100644
--- a/python/fatcat/orcid_importer.py
+++ b/python/fatcat/orcid_importer.py
@@ -30,6 +30,8 @@ class FatcatOrcidImporter:
returns a CreatorEntity
"""
name = obj['person']['name']
+ if name is None:
+ return None
extra = None
given = value_or_none(name.get('given-name'))
sur = value_or_none(name.get('family-name'))
@@ -48,7 +50,8 @@ class FatcatOrcidImporter:
def process_line(self, line):
obj = json.loads(line)
ce = self.parse_orcid_dict(obj)
- self.api.create_creator(ce)
+ if ce is not None:
+ self.api.create_creator(ce)
def process_source(self, source):
for line in source:
@@ -59,5 +62,6 @@ class FatcatOrcidImporter:
for lines in grouper(source, size):
objects = [self.parse_orcid_dict(json.loads(l))
for l in lines if l != None]
+ objects = [o for o in objects if o != None]
self.api.create_creator_batch(objects)
- break
+ print("inserted {}".format(len(objects)))