From 0f6354ffbdf7115f8a6d7e4d3ea700a44fe567ed Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Sat, 9 Jun 2018 00:59:33 -0700 Subject: fixes to orcid importer for larger batches --- python/README_import.md | 31 +++++++++++++++++++++++++++++++ python/fatcat/orcid_importer.py | 8 ++++++-- 2 files changed, 37 insertions(+), 2 deletions(-) create mode 100644 python/README_import.md (limited to 'python') diff --git a/python/README_import.md b/python/README_import.md new file mode 100644 index 00000000..11cb0fd8 --- /dev/null +++ b/python/README_import.md @@ -0,0 +1,31 @@ + +## ORCID + +Passing a raw ORCID profile file directly does not work (the importer parses one JSON object per input line): + + ./client.py import-orcid /data/orcid/partial/public_profiles_API-2.0_2017_10_json/3/0000-0001-5115-8623.json + +Instead, compact the file to a single line with `jq -c` and pipe it in on stdin (`-`): + + cat /data/orcid/partial/public_profiles_API-2.0_2017_10_json/3/0000-0001-5115-8623.json | jq -c . | ./client.py import-orcid - + +Or for many files: + + find /data/orcid/partial/public_profiles_API-2.0_2017_10_json/3 -iname '*.json' | parallel --bar jq -c . {} | rg '"person":' | ./client.py import-orcid - + + +Timing for ~9k files (4 parallel jobs first, then a single process): + + (python-B2RYrks8) bnewbold@orithena$ time parallel --pipepart -j4 -a /data/orcid/partial/public_profiles_API-2.0_2017_10_json/all.json ./client.py import-orcid - + real 0m15.294s + user 0m28.112s + sys 0m2.408s + + => 636/second + + (python-B2RYrks8) bnewbold@orithena$ time ./client.py import-orcid /data/orcid/partial/public_profiles_API-2.0_2017_10_json/all.json + real 0m47.268s + user 0m2.616s + sys 0m0.104s + + => 203/second diff --git a/python/fatcat/orcid_importer.py b/python/fatcat/orcid_importer.py index 063390b8..02681b0a 100644 --- a/python/fatcat/orcid_importer.py +++ b/python/fatcat/orcid_importer.py @@ -30,6 +30,8 @@ class FatcatOrcidImporter: returns a CreatorEntity """ name = obj['person']['name'] + if name is None: + return None extra = None given = value_or_none(name.get('given-name')) sur = value_or_none(name.get('family-name')) @@ -48,7 +50,8 @@ class FatcatOrcidImporter: def process_line(self, line):
obj = json.loads(line) ce = self.parse_orcid_dict(obj) - self.api.create_creator(ce) + if ce is not None: + self.api.create_creator(ce) def process_source(self, source): for line in source: @@ -59,5 +62,6 @@ class FatcatOrcidImporter: for lines in grouper(source, size): objects = [self.parse_orcid_dict(json.loads(l)) for l in lines if l != None] + objects = [o for o in objects if o != None] self.api.create_creator_batch(objects) - break + print("inserted {}".format(len(objects))) -- cgit v1.2.3