From ce0c06ca8e694362a3bf4cde175efbe1af6e4962 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Sat, 9 Jun 2018 00:28:51 -0700
Subject: basic ORCID importer

---
 python/client.py                            | 30 ++++++++++--
 python/fatcat/orcid_importer.py             | 73 +++++++++++++++++++++++++++++
 python/tests/files/0000-0001-8254-7103.json |  1 +
 python/tests/orcid.py                       | 15 +++++++
 4 files changed, 114 insertions(+), 5 deletions(-)
 create mode 100644 python/fatcat/orcid_importer.py
 create mode 100644 python/tests/files/0000-0001-8254-7103.json
 create mode 100644 python/tests/orcid.py

diff --git a/python/client.py b/python/client.py
index 070d046a..14814512 100755
--- a/python/client.py
+++ b/python/client.py
@@ -1,13 +1,20 @@
 #!/usr/bin/env python3
 
+import sys
 import argparse
 from fatcat.api_client import FatCatApiClient
+from fatcat.orcid_importer import FatcatOrcidImporter
 
-def import_crossref(args):
+def run_import_crossref(args):
     fcc = FatCatApiClient(args.host_url)
     fcc.import_crossref_file(args.json_file,
         create_containers=args.create_containers)
 
+def run_import_orcid(args):
+    """Send the ORCID records in args.json_file to the API in batches."""
+    foi = FatcatOrcidImporter(args.host_url)
+    foi.process_batch(args.json_file, size=args.batch_size)
+
 def health(args):
     fcc = FatCatApiClient(args.host_url)
     print(fcc.health())
@@ -18,23 +25,36 @@ def main():
         action='store_true',
         help="enable debugging interface")
     parser.add_argument('--host-url',
-        default="http://localhost:9411",
+        default="http://localhost:9411/v0",
         help="connect to this host/port")
     subparsers = parser.add_subparsers()
 
-    sub_import_crossref = subparsers.add_parser('import-crossref',
-        aliases=['ic'])
-    sub_import_crossref.set_defaults(func=import_crossref)
+    sub_import_crossref = subparsers.add_parser('import-crossref')
+    sub_import_crossref.set_defaults(func=run_import_crossref)
     sub_import_crossref.add_argument('json_file',
         help="crossref JSON file to import from")
     sub_import_crossref.add_argument('--create-containers',
         action='store_true',
         help="if true, create containers based on ISSN")
 
+    sub_import_orcid = subparsers.add_parser('import-orcid')
+    sub_import_orcid.set_defaults(func=run_import_orcid)
+    # nargs='?' makes the positional optional, so the stdin default can apply
+    sub_import_orcid.add_argument('json_file',
+        help="orcid JSON file to import from (or stdin)",
+        nargs='?', default=sys.stdin, type=argparse.FileType('r'))
+    sub_import_orcid.add_argument('--batch-size',
+        help="size of batch to send",
+        default=50, type=int)
+
     sub_health = subparsers.add_parser('health')
     sub_health.set_defaults(func=health)
 
     args = parser.parse_args()
+    # 'func' is only set by the subparsers; it is absent without a subcommand
+    if getattr(args, "func", None) is None:
+        print("tell me what to do!")
+        sys.exit(-1)
     args.func(args)
 
 if __name__ == '__main__':
diff --git a/python/fatcat/orcid_importer.py b/python/fatcat/orcid_importer.py
new file mode 100644
index 00000000..063390b8
--- /dev/null
+++ b/python/fatcat/orcid_importer.py
@@ -0,0 +1,73 @@
+
+import sys
+import json
+import itertools
+import fatcat_client
+
+def value_or_none(e):
+    """Unwrap a {"value": ...} dict and turn empty strings into None."""
+    if isinstance(e, dict):
+        e = e.get('value')
+    if isinstance(e, str) and len(e) == 0:
+        e = None
+    return e
+
+# from: https://docs.python.org/3/library/itertools.html
+def grouper(iterable, n, fillvalue=None):
+    "Collect data into fixed-length chunks or blocks"
+    args = [iter(iterable)] * n
+    return itertools.zip_longest(*args, fillvalue=fillvalue)
+
+class FatcatOrcidImporter:
+    """Creates fatcat creator entities from line-delimited ORCID JSON."""
+
+    def __init__(self, host_url):
+        """Build the API client used for all requests to host_url."""
+        conf = fatcat_client.Configuration()
+        conf.host = host_url
+        self.api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf))
+
+    def parse_orcid_dict(self, obj):
+        """
+        obj is a python dict (parsed from json).
+
+        returns a CreatorEntity
+        """
+        name = obj['person']['name']
+        extra = None
+        # the record keys this field as 'given-names' (plural)
+        given = value_or_none(name.get('given-names'))
+        sur = value_or_none(name.get('family-name'))
+        display = value_or_none(name.get('credit-name'))
+        if display is None:
+            # TODO: sorry human beings
+            # join only the parts that exist, so a missing name part is
+            # never rendered as the literal string "None"
+            parts = [str(p) for p in (given, sur) if p is not None]
+            display = " ".join(parts) or None
+        ce = fatcat_client.CreatorEntity(
+            orcid=obj['orcid-identifier']['path'],
+            given_name=given,
+            surname=sur,
+            display_name=display,
+            extra=extra)
+        return ce
+
+    def process_line(self, line):
+        """Parse one JSON line and create that creator with one API call."""
+        obj = json.loads(line)
+        ce = self.parse_orcid_dict(obj)
+        self.api.create_creator(ce)
+
+    def process_source(self, source):
+        """Import every line of source, one API call per line."""
+        for line in source:
+            self.process_line(line)
+
+    def process_batch(self, source, size=50):
+        """Reads and processes in batches (not API-call-per-line)"""
+        for lines in grouper(source, size):
+            # grouper() pads the last batch with None; drop the padding
+            objects = [self.parse_orcid_dict(json.loads(l))
+                       for l in lines if l is not None]
+            self.api.create_creator_batch(objects)
diff --git a/python/tests/files/0000-0001-8254-7103.json b/python/tests/files/0000-0001-8254-7103.json
new file mode 100644
index 00000000..2bf437d0
--- /dev/null
+++ b/python/tests/files/0000-0001-8254-7103.json
@@ -0,0 +1 @@
+{"orcid-identifier":{"uri":"http://orcid.org/0000-0001-8254-7103","path":"0000-0001-8254-7103","host":"orcid.org"},"preferences":{"locale":"en"},"history":{"creation-method":"Member-referred","completion-date":null,"submission-date":{"value":1407501041999},"last-modified-date":{"value":1465949566770},"claimed":true,"source":null,"deactivation-date":null,"verified-email":true,"verified-primary-email":true},"person":{"last-modified-date":null,"name":{"created-date":{"value":1460755375159},"last-modified-date":{"value":1460755375159},"given-names":{"value":"Man-Hui"},"family-name":{"value":"Li"},"credit-name":null,"source":null,"visibility":"public","path":"0000-0001-8254-7103"},"other-names":{"last-modified-date":null,"other-name":null,"path":"/0000-0001-8254-7103/other-names"},"biography":{"created-date":{"value":1460755375161},"last-modified-date":{"value":1460755375161},"content":null,"visibility":"public","path":"/0000-0001-8254-7103/biography"},"researcher-urls":{"last-modified-date":null,"researcher-url":null,"path":"/0000-0001-8254-7103/researcher-urls"},"emails":{"last-modified-date":null,"email":null,"path":"/0000-0001-8254-7103/email"},"addresses":{"last-modified-date":null,"address":null,"path":"/0000-0001-8254-7103/address"},"keywords":{"last-modified-date":null,"keyword":null,"path":"/0000-0001-8254-7103/keywords"},"external-identifiers":{"last-modified-date":null,"external-identifier":null,"path":"/0000-0001-8254-7103/external-identifiers"},"path":"/0000-0001-8254-7103/person"},"activities-summary":{"last-modified-date":null,"educations":{"last-modified-date":null,"education-summary":null,"path":"/0000-0001-8254-7103/educations"},"employments":{"last-modified-date":null,"employment-summary":null,"path":"/0000-0001-8254-7103/employments"},"fundings":{"last-modified-date":null,"group":null,"path":"/0000-0001-8254-7103/fundings"},"peer-reviews":{"last-modified-date":null,"group":null,"path":"/0000-0001-8254-7103/peer-reviews"},"works":{"last-modified-date":null,"group":null,"path":"/0000-0001-8254-7103/works"},"path":"/0000-0001-8254-7103/activities"},"path":"/0000-0001-8254-7103"}
diff --git a/python/tests/orcid.py b/python/tests/orcid.py
new file mode 100644
index 00000000..86a23603
--- /dev/null
+++ b/python/tests/orcid.py
@@ -0,0 +1,15 @@
+# Tests for FatcatOrcidImporter; presumably need a fatcat API at localhost:9411 -- confirm
+import pytest
+from fatcat.orcid_importer import FatcatOrcidImporter
+
+@pytest.fixture(scope="function")
+def orcid_importer():
+    yield FatcatOrcidImporter("http://localhost:9411/v0")
+
+def test_orcid_importer_batch(orcid_importer):
+    with open('tests/files/0000-0001-8254-7103.json', 'r') as f:  # relative path; presumably run from python/ -- confirm
+        orcid_importer.process_batch(f)
+
+def test_orcid_importer(orcid_importer):
+    with open('tests/files/0000-0001-8254-7103.json', 'r') as f:
+        orcid_importer.process_source(f)
-- 
cgit v1.2.3