diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2018-06-09 00:59:33 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2018-06-09 00:59:33 -0700 |
commit | 0f6354ffbdf7115f8a6d7e4d3ea700a44fe567ed (patch) | |
tree | d6fb3d094dd64f51b0d4723a0c30112b89b7c3d7 | |
parent | ce0c06ca8e694362a3bf4cde175efbe1af6e4962 (diff) | |
download | fatcat-0f6354ffbdf7115f8a6d7e4d3ea700a44fe567ed.tar.gz fatcat-0f6354ffbdf7115f8a6d7e4d3ea700a44fe567ed.zip |
fixes to orcid importer for larger batches
-rw-r--r-- | python/README_import.md | 31 | ||||
-rw-r--r-- | python/fatcat/orcid_importer.py | 8 |
2 files changed, 37 insertions, 2 deletions
```diff
diff --git a/python/README_import.md b/python/README_import.md
new file mode 100644
index 00000000..11cb0fd8
--- /dev/null
+++ b/python/README_import.md
@@ -0,0 +1,31 @@
+
+## ORCID
+
+Does not work:
+
+    ./client.py import-orcid /data/orcid/partial/public_profiles_API-2.0_2017_10_json/3/0000-0001-5115-8623.json
+
+Instead:
+
+    cat /data/orcid/partial/public_profiles_API-2.0_2017_10_json/3/0000-0001-5115-8623.json | jq -c . | ./client.py import-orcid -
+
+Or for many files:
+
+    find /data/orcid/partial/public_profiles_API-2.0_2017_10_json/3 -iname '*.json' | parallel --bar jq -c . {} | rg '"person":' | ./client.py import-orcid -
+
+
+for ~9k files:
+
+    (python-B2RYrks8) bnewbold@orithena$ time parallel --pipepart -j4 -a /data/orcid/partial/public_profiles_API-2.0_2017_10_json/all.json ./client.py import-orcid -
+    real 0m15.294s
+    user 0m28.112s
+    sys 0m2.408s
+
+    => 636/second
+
+    (python-B2RYrks8) bnewbold@orithena$ time ./client.py import-orcid /data/orcid/partial/public_profiles_API-2.0_2017_10_json/all.json
+    real 0m47.268s
+    user 0m2.616s
+    sys 0m0.104s
+
+    => 203/second
diff --git a/python/fatcat/orcid_importer.py b/python/fatcat/orcid_importer.py
index 063390b8..02681b0a 100644
--- a/python/fatcat/orcid_importer.py
+++ b/python/fatcat/orcid_importer.py
@@ -30,6 +30,8 @@ class FatcatOrcidImporter:
         returns a CreatorEntity
         """
         name = obj['person']['name']
+        if name is None:
+            return None
         extra = None
         given = value_or_none(name.get('given-name'))
         sur = value_or_none(name.get('family-name'))
@@ -48,7 +50,8 @@ class FatcatOrcidImporter:
     def process_line(self, line):
         obj = json.loads(line)
         ce = self.parse_orcid_dict(obj)
-        self.api.create_creator(ce)
+        if ce is not None:
+            self.api.create_creator(ce)
 
     def process_source(self, source):
         for line in source:
@@ -59,5 +62,6 @@ class FatcatOrcidImporter:
         for lines in grouper(source, size):
             objects = [self.parse_orcid_dict(json.loads(l))
                        for l in lines if l != None]
+            objects = [o for o in objects if o != None]
             self.api.create_creator_batch(objects)
-            break
+            print("inserted {}".format(len(objects)))
```