diff options
Diffstat (limited to 'python')
| -rw-r--r-- | python/README_import.md | 31 | ||||
| -rw-r--r-- | python/fatcat/orcid_importer.py | 8 | 
2 files changed, 37 insertions, 2 deletions
diff --git a/python/README_import.md b/python/README_import.md
new file mode 100644
index 00000000..11cb0fd8
--- /dev/null
+++ b/python/README_import.md
@@ -0,0 +1,31 @@
+
+## ORCID
+
+Does not work:
+
+    ./client.py import-orcid /data/orcid/partial/public_profiles_API-2.0_2017_10_json/3/0000-0001-5115-8623.json
+
+Instead:
+
+    cat /data/orcid/partial/public_profiles_API-2.0_2017_10_json/3/0000-0001-5115-8623.json | jq -c . | ./client.py import-orcid -
+
+Or for many files:
+
+    find /data/orcid/partial/public_profiles_API-2.0_2017_10_json/3 -iname '*.json' | parallel --bar jq -c . {} | rg '"person":' | ./client.py import-orcid -
+
+
+for ~9k files:
+
+    (python-B2RYrks8) bnewbold@orithena$ time parallel --pipepart -j4 -a /data/orcid/partial/public_profiles_API-2.0_2017_10_json/all.json ./client.py import-orcid -
+    real    0m15.294s
+    user    0m28.112s
+    sys     0m2.408s
+
+    => 636/second
+
+    (python-B2RYrks8) bnewbold@orithena$ time ./client.py import-orcid /data/orcid/partial/public_profiles_API-2.0_2017_10_json/all.json
+    real    0m47.268s
+    user    0m2.616s
+    sys     0m0.104s
+
+    => 203/second
diff --git a/python/fatcat/orcid_importer.py b/python/fatcat/orcid_importer.py
index 063390b8..02681b0a 100644
--- a/python/fatcat/orcid_importer.py
+++ b/python/fatcat/orcid_importer.py
@@ -30,6 +30,8 @@ class FatcatOrcidImporter:
         returns a CreatorEntity
         """
         name = obj['person']['name']
+        if name is None:
+            return None
         extra = None
         given = value_or_none(name.get('given-name'))
         sur = value_or_none(name.get('family-name'))
@@ -48,7 +50,8 @@ class FatcatOrcidImporter:
     def process_line(self, line):
         obj = json.loads(line)
         ce = self.parse_orcid_dict(obj)
-        self.api.create_creator(ce)
+        if ce is not None:
+            self.api.create_creator(ce)
 
     def process_source(self, source):
         for line in source:
@@ -59,5 +62,6 @@ class FatcatOrcidImporter:
         for lines in grouper(source, size):
             objects = [self.parse_orcid_dict(json.loads(l))
                        for l in lines if l != None]
+            objects = [o for o in objects if o != None]
             self.api.create_creator_batch(objects)
-            break
+            print("inserted {}".format(len(objects)))
