aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2018-06-09 00:28:51 -0700
committer Bryan Newbold <bnewbold@robocracy.org> 2018-06-09 00:28:51 -0700
commit ce0c06ca8e694362a3bf4cde175efbe1af6e4962 (patch)
tree 94347a1c9d1900bffd79092bc94eb12a6ae6ea31
parent 1960f398fb3ae664bec0efaaa6a399a7d01675cb (diff)
download fatcat-ce0c06ca8e694362a3bf4cde175efbe1af6e4962.tar.gz
fatcat-ce0c06ca8e694362a3bf4cde175efbe1af6e4962.zip
basic ORCID importer
-rwxr-xr-x python/client.py 27
-rw-r--r-- python/fatcat/orcid_importer.py 63
-rw-r--r-- python/tests/files/0000-0001-8254-7103.json 1
-rw-r--r-- python/tests/orcid.py 15
4 files changed, 101 insertions, 5 deletions
diff --git a/python/client.py b/python/client.py
index 070d046a..14814512 100755
--- a/python/client.py
+++ b/python/client.py
@@ -1,13 +1,19 @@
#!/usr/bin/env python3
+import sys
import argparse
from fatcat.api_client import FatCatApiClient
+from fatcat.orcid_importer import FatcatOrcidImporter
-def import_crossref(args):
def run_import_crossref(args):
    """CLI handler for ``import-crossref``.

    Builds a FatCatApiClient for ``args.host_url`` and asks it to import
    ``args.json_file``, forwarding the ``--create-containers`` flag.
    """
    client = FatCatApiClient(args.host_url)
    client.import_crossref_file(
        args.json_file,
        create_containers=args.create_containers,
    )
def run_import_orcid(args):
    """CLI handler for ``import-orcid``.

    Builds a FatcatOrcidImporter for ``args.host_url`` and feeds it
    ``args.json_file`` in batches of ``args.batch_size``.
    """
    importer = FatcatOrcidImporter(args.host_url)
    importer.process_batch(args.json_file, size=args.batch_size)
+
def health(args):
    """CLI handler for ``health``: print what the API client's health() returns."""
    client = FatCatApiClient(args.host_url)
    status = client.health()
    print(status)
@@ -18,23 +24,34 @@ def main():
action='store_true',
help="enable debugging interface")
parser.add_argument('--host-url',
- default="http://localhost:9411",
+ default="http://localhost:9411/v0",
help="connect to this host/port")
subparsers = parser.add_subparsers()
- sub_import_crossref = subparsers.add_parser('import-crossref',
- aliases=['ic'])
- sub_import_crossref.set_defaults(func=import_crossref)
+ sub_import_crossref = subparsers.add_parser('import-crossref')
+ sub_import_crossref.set_defaults(func=run_import_crossref)
sub_import_crossref.add_argument('json_file',
help="crossref JSON file to import from")
sub_import_crossref.add_argument('--create-containers',
action='store_true',
help="if true, create containers based on ISSN")
+ sub_import_orcid = subparsers.add_parser('import-orcid')
+ sub_import_orcid.set_defaults(func=run_import_orcid)
+ sub_import_orcid.add_argument('json_file',
+ help="orcid JSON file to import from (or stdin)",
+ default=sys.stdin, type=argparse.FileType('r'))
+ sub_import_orcid.add_argument('--batch-size',
+ help="size of batch to send",
+ default=50, type=int)
+
sub_health = subparsers.add_parser('health')
sub_health.set_defaults(func=health)
args = parser.parse_args()
+ if not args.__dict__.get("func"):
+ print("tell me what to do!")
+ sys.exit(-1)
args.func(args)
if __name__ == '__main__':
diff --git a/python/fatcat/orcid_importer.py b/python/fatcat/orcid_importer.py
new file mode 100644
index 00000000..063390b8
--- /dev/null
+++ b/python/fatcat/orcid_importer.py
@@ -0,0 +1,63 @@
+
+import sys
+import json
+import itertools
+import fatcat_client
+
def value_or_none(e):
    """Unwrap a ``{'value': ...}`` wrapper and normalize empty strings.

    If e is a dict (or dict subclass), it is replaced by its 'value' entry
    (None when the key is absent). If the result is an empty string it
    becomes None. Anything else is returned unchanged.
    """
    # isinstance (rather than an exact type comparison) so that dict/str
    # subclasses, e.g. OrderedDict from a custom JSON loader, are unwrapped too.
    if isinstance(e, dict):
        e = e.get('value')
    if isinstance(e, str) and not e:
        e = None
    return e
+
+# from: https://docs.python.org/3/library/itertools.html
def grouper(iterable, n, fillvalue=None):
    """Collect data into fixed-length chunks or blocks.

    Yields tuples of length n drawn in order from iterable; the final tuple
    is padded with fillvalue when the input runs out mid-chunk.
    """
    # Every slot refers to the *same* iterator, so zipping the slots together
    # pulls n consecutive items per output tuple.
    shared = iter(iterable)
    slots = (shared,) * n
    return itertools.zip_longest(*slots, fillvalue=fillvalue)
+
class FatcatOrcidImporter:
    """Import ORCID person records into fatcat as creator entities.

    Input is consumed line by line, each line holding one ORCID record as
    JSON. Records are converted to ``fatcat_client.CreatorEntity`` objects
    and submitted through the API, either one per call or in batches.
    """

    def __init__(self, host_url):
        """Create the API client; host_url becomes the configuration's host."""
        conf = fatcat_client.Configuration()
        conf.host = host_url
        self.api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf))

    def parse_orcid_dict(self, obj):
        """
        obj is a python dict (parsed from json).
        returns a CreatorEntity

        Raises KeyError if obj lacks 'person', 'name' or 'orcid-identifier'.
        """
        # NOTE(review): a null 'name' object is treated as "no name fields"
        # instead of crashing on .get(); presumably this occurs for records
        # without a public name -- confirm against real dumps.
        name = obj['person']['name'] or {}
        extra = None
        # The ORCID record key is plural: 'given-names' (see the test fixture).
        given = value_or_none(name.get('given-names'))
        sur = value_or_none(name.get('family-name'))
        display = value_or_none(name.get('credit-name'))
        if display is None:
            # TODO: sorry human beings
            # Build the fallback only from parts that exist, so a missing
            # part never leaks into the name as the literal text "None".
            if given is not None and sur is not None:
                display = "{} {}".format(given, sur)
            else:
                # NOTE(review): display stays None when no name part exists;
                # confirm the API accepts a creator without display_name.
                display = given if given is not None else sur
        ce = fatcat_client.CreatorEntity(
            orcid=obj['orcid-identifier']['path'],
            given_name=given,
            surname=sur,
            display_name=display,
            extra=extra)
        return ce

    def process_line(self, line):
        """Parse one JSON line and create the creator with a single API call."""
        obj = json.loads(line)
        ce = self.parse_orcid_dict(obj)
        self.api.create_creator(ce)

    def process_source(self, source):
        """Process every line of source, one API call per line."""
        for line in source:
            self.process_line(line)

    def process_batch(self, source, size=50):
        """Reads and processes in batches (not API-call-per-line)

        source is any iterable of JSON lines; size is the maximum number of
        records per create_creator_batch call. All of source is consumed;
        the final batch may be shorter than size.

        Raises ValueError if size is less than 1.
        """
        if size < 1:
            # A non-positive size would otherwise import nothing, silently.
            raise ValueError("size must be a positive integer")
        lines = iter(source)
        while True:
            # islice takes up to `size` lines without padding the last batch.
            chunk = list(itertools.islice(lines, size))
            if not chunk:
                break
            objects = [self.parse_orcid_dict(json.loads(line))
                       for line in chunk]
            self.api.create_creator_batch(objects)
diff --git a/python/tests/files/0000-0001-8254-7103.json b/python/tests/files/0000-0001-8254-7103.json
new file mode 100644
index 00000000..2bf437d0
--- /dev/null
+++ b/python/tests/files/0000-0001-8254-7103.json
@@ -0,0 +1 @@
+{"orcid-identifier":{"uri":"http://orcid.org/0000-0001-8254-7103","path":"0000-0001-8254-7103","host":"orcid.org"},"preferences":{"locale":"en"},"history":{"creation-method":"Member-referred","completion-date":null,"submission-date":{"value":1407501041999},"last-modified-date":{"value":1465949566770},"claimed":true,"source":null,"deactivation-date":null,"verified-email":true,"verified-primary-email":true},"person":{"last-modified-date":null,"name":{"created-date":{"value":1460755375159},"last-modified-date":{"value":1460755375159},"given-names":{"value":"Man-Hui"},"family-name":{"value":"Li"},"credit-name":null,"source":null,"visibility":"public","path":"0000-0001-8254-7103"},"other-names":{"last-modified-date":null,"other-name":null,"path":"/0000-0001-8254-7103/other-names"},"biography":{"created-date":{"value":1460755375161},"last-modified-date":{"value":1460755375161},"content":null,"visibility":"public","path":"/0000-0001-8254-7103/biography"},"researcher-urls":{"last-modified-date":null,"researcher-url":null,"path":"/0000-0001-8254-7103/researcher-urls"},"emails":{"last-modified-date":null,"email":null,"path":"/0000-0001-8254-7103/email"},"addresses":{"last-modified-date":null,"address":null,"path":"/0000-0001-8254-7103/address"},"keywords":{"last-modified-date":null,"keyword":null,"path":"/0000-0001-8254-7103/keywords"},"external-identifiers":{"last-modified-date":null,"external-identifier":null,"path":"/0000-0001-8254-7103/external-identifiers"},"path":"/0000-0001-8254-7103/person"},"activities-summary":{"last-modified-date":null,"educations":{"last-modified-date":null,"education-summary":null,"path":"/0000-0001-8254-7103/educations"},"employments":{"last-modified-date":null,"employment-summary":null,"path":"/0000-0001-8254-7103/employments"},"fundings":{"last-modified-date":null,"group":null,"path":"/0000-0001-8254-7103/fundings"},"peer-reviews":{"last-modified-date":null,"group":null,"path":"/0000-0001-8254-7103/peer-reviews"},"works":{"last-modified-date"
:null,"group":null,"path":"/0000-0001-8254-7103/works"},"path":"/0000-0001-8254-7103/activities"},"path":"/0000-0001-8254-7103"}
diff --git a/python/tests/orcid.py b/python/tests/orcid.py
new file mode 100644
index 00000000..86a23603
--- /dev/null
+++ b/python/tests/orcid.py
@@ -0,0 +1,15 @@
+
+import pytest
+from fatcat.orcid_importer import FatcatOrcidImporter
+
@pytest.fixture(scope="function")
def orcid_importer():
    """Provide a FatcatOrcidImporter configured for the local API URL."""
    importer = FatcatOrcidImporter("http://localhost:9411/v0")
    yield importer
+
def test_orcid_importer_batch(orcid_importer):
    """Run the sample ORCID record through the batch import path."""
    with open('tests/files/0000-0001-8254-7103.json', 'r') as handle:
        orcid_importer.process_batch(handle)
+
def test_orcid_importer(orcid_importer):
    """Run the sample ORCID record through the line-at-a-time import path."""
    with open('tests/files/0000-0001-8254-7103.json', 'r') as handle:
        orcid_importer.process_source(handle)