summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2018-06-20 09:37:37 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2018-06-20 09:37:37 -0700
commit: bde5c8f14e13afe4d54e9bfafd8bda8b0f33f804 (patch)
tree: 67d7039b1621bebdafd89539602c2b5d05332501
parent: 698399c49edcefe33c012856b604985925969a77 (diff)
download: fatcat-bde5c8f14e13afe4d54e9bfafd8bda8b0f33f804.tar.gz
          fatcat-bde5c8f14e13afe4d54e9bfafd8bda8b0f33f804.zip
python: refactor importer code (+crossref)
-rwxr-xr-x python/client.py 19
-rw-r--r-- python/fatcat/api_client.py 177
-rw-r--r-- python/fatcat/crossref_importer.py 112
-rw-r--r-- python/fatcat/importer_common.py 53
-rw-r--r-- python/fatcat/orcid_importer.py 49
-rw-r--r-- python/fatcat/raw_api_client.py 66
-rw-r--r-- python/tests/api_client.py 15
-rw-r--r-- python/tests/crossref.py 16
-rw-r--r-- python/tests/fixtures.py 17
-rw-r--r-- python/tests/orcid.py 2
10 files changed, 274 insertions, 252 deletions
diff --git a/python/client.py b/python/client.py
index 14814512..9631318a 100755
--- a/python/client.py
+++ b/python/client.py
@@ -2,21 +2,21 @@
import sys
import argparse
-from fatcat.api_client import FatCatApiClient
+from fatcat.raw_api_client import RawFatcatApiClient
from fatcat.orcid_importer import FatcatOrcidImporter
def run_import_crossref(args):
- fcc = FatCatApiClient(args.host_url)
- fcc.import_crossref_file(args.json_file,
- create_containers=args.create_containers)
+    """Import the Crossref JSON-lines file named by args.json_file."""
+    # Imported locally: the module-level imports of this script do not
+    # pull in the Crossref importer.
+    from fatcat.crossref_importer import FatcatCrossrefImporter
+    fci = FatcatCrossrefImporter(args.host_url)
+    # json_file is a plain path, while the importer consumes an iterable
+    # of JSON rows, so open the file and hand over the line iterator.
+    with open(args.json_file, 'r') as json_file:
+        fci.process_source(json_file)
+    # TODO: create_containers=args.create_containers
def run_import_orcid(args):
foi = FatcatOrcidImporter(args.host_url)
foi.process_batch(args.json_file, size=args.batch_size)
def health(args):
- fcc = FatCatApiClient(args.host_url)
- print(fcc.health())
+ rfac = RawFatcatApiClient(args.host_url)
+ print(rfac.health())
def main():
parser = argparse.ArgumentParser()
@@ -32,9 +32,10 @@ def main():
sub_import_crossref.set_defaults(func=run_import_crossref)
sub_import_crossref.add_argument('json_file',
help="crossref JSON file to import from")
- sub_import_crossref.add_argument('--create-containers',
- action='store_true',
- help="if true, create containers based on ISSN")
+ # TODO:
+ #sub_import_crossref.add_argument('--create-containers',
+ # action='store_true',
+ # help="if true, create containers based on ISSN")
sub_import_orcid = subparsers.add_parser('import-orcid')
sub_import_orcid.set_defaults(func=run_import_orcid)
diff --git a/python/fatcat/api_client.py b/python/fatcat/api_client.py
deleted file mode 100644
index 4c000609..00000000
--- a/python/fatcat/api_client.py
+++ /dev/null
@@ -1,177 +0,0 @@
-
-import sys
-import json
-import requests
-
-
-class FatCatApiClient:
-
- def __init__(self, host_url):
- self.host_url = host_url
- self.session = requests.Session()
- self._issn_map = dict()
-
- def get(self, path, data=None):
- headers = {"content-type": "application/json"}
- return self.session.get(self.host_url + path, json=data,
- headers=headers)
-
- def post(self, path, data=None):
- headers = {"content-type": "application/json"}
- return self.session.post(self.host_url + path, json=data,
- headers=headers)
-
- def new_editgroup(self):
- rv = self.post('/v0/editgroup', data=dict(
- editor_id=1))
- print(rv)
- print(rv.json())
- assert rv.status_code == 201
- editgroup_id = rv.json()['id']
- return editgroup_id
-
- def accept_editgroup(self, eg):
- rv = self.post('/v0/editgroup/{}/accept'.format(eg))
- assert rv.status_code == 200
- return rv
-
- def lookup_issn(self, issn):
- assert len(issn) == 9 and issn[4] == '-'
- if issn in self._issn_map:
- return self._issn_map[issn]
- rv = self.get('/v0/container/lookup', data=dict(issn=issn))
- container_id = None
- if rv.status_code == 200:
- container_id = rv.json()['id']
- else:
- # only other valid response is a 404; otherwise we had an error
- assert rv.status_code == 404
- self._issn_map[issn] = container_id
- return container_id
-
- def import_crossref_file(self, json_file, create_containers=False, batchsize=100):
- eg = self.new_editgroup()
- i = 0
- with open(json_file, 'r') as file:
- for line in file:
- if i % batchsize == 0:
- sys.stdout.write('\n{}: '.format(i))
- if (i+1) % 20 == 0:
- sys.stdout.write('.')
- i = i + 1
- obj = json.loads(line)
- if not ("author" in obj and "title" in obj):
- continue
- try:
- self.import_crossref_dict(obj, editgroup=eg,
- create_containers=create_containers)
- except Exception as e:
- print("ERROR: {}".format(e))
- if i % batchsize == 0:
- self.accept_editgroup(eg)
- eg = self.new_editgroup()
- if i % batchsize != 0:
- self.accept_editgroup(eg)
- print("done!")
-
- def import_crossref_dict(self, meta, editgroup=None,
- create_containers=False):
-
- # creators
- creators = []
- for am in meta['author']:
- c = dict(name="{} {}".format(am['given'], am['family']),
- sortname="{}, {}".format(am['family'], am['given']),
- orcid=None)
- creators.append(c)
-
- # container
- issn = meta.get('ISSN', [None])[0]
- container_id = self.lookup_issn(issn)
- container = dict(
- issn=issn,
- name=meta['container-title'][0],
- container=container_id,
- #sortname=meta['short-container-title'][0])
- publisher=meta['publisher'])
-
- if container_id is None and create_containers and issn != None:
- rv = self.post('/v0/container', data=dict(
- issn=container['issn'],
- publisher=container['publisher']))
- assert rv.status_code == 201
- container_id = rv.json()['id']
- print("created container: {}".format(issn))
- container['id'] = container_id
- self._issn_map[issn] = container_id
-
- # references
- refs = []
- for i, rm in enumerate(meta.get('reference', [])):
- ref = dict(
- doi=rm.get("DOI", None),
- index=i+1,
- # TODO: how to generate a proper stub here from k/v metadata?
- stub="| ".join(rm.values()))
- refs.append(ref)
-
- # work and release
- title = meta['title'][0]
- rv = self.post('/v0/work',
- data=dict(title=title, editgroup=editgroup)) #work_type="book"
- assert rv.status_code == 201
- work_id = rv.json()['id']
-
- extra = dict(crossref={
- 'links': meta.get('link', []),
- 'subject': meta.get('subject'),
- 'type': meta['type'],
- 'alternative-id': meta.get('alternative-id', [])})
-
- rv = self.post('/v0/release', data=dict(
- title=title,
- work=work_id,
- # XXX: creators=creators,
- # XXX: refs=refs,
- # XXX: container=container_id,
- release_type=meta['type'],
- doi=meta['DOI'],
- date=meta['created']['date-time'],
- license=meta.get('license', [dict(URL=None)])[0]['URL'] or None,
- issue=meta.get('issue', None),
- volume=meta.get('volume', None),
- pages=meta.get('page', None),
- editgroup=editgroup,
- extra=extra))
- assert rv.status_code == 201
- release_id = rv.json()['id']
-
- def import_issn_file(self, json_file, create_containers=False, batchsize=100):
- eg = self.new_editgroup()
- i = 0
- with open(json_file, 'r') as file:
- for line in file:
- if i % batchsize == 0:
- sys.stdout.write('\n{}: '.format(i))
- if (i+1) % 20 == 0:
- sys.stdout.write('.')
- i = i + 1
- obj = json.loads(line)
- if not ("author" in obj and "title" in obj):
- continue
- try:
- self.import_crossref_dict(obj, editgroup=eg,
- create_containers=create_containers)
- except Exception as e:
- print("ERROR: {}".format(e))
- if i % batchsize == 0:
- self.accept_editgroup(eg)
- eg = self.new_editgroup()
- if i % batchsize != 0:
- self.accept_editgroup(eg)
- print("done!")
-
- def health(self):
- rv = self.get("/health")
- assert rv.status_code == 200
- return rv.json()
diff --git a/python/fatcat/crossref_importer.py b/python/fatcat/crossref_importer.py
new file mode 100644
index 00000000..4c68230d
--- /dev/null
+++ b/python/fatcat/crossref_importer.py
@@ -0,0 +1,112 @@
+
+import sys
+import json
+import itertools
+import fatcat_client
+from fatcat.importer_common import FatcatImporter
+
+
+class FatcatCrossrefImporter(FatcatImporter):
+    """Importer that turns Crossref work records into work/release pairs.
+
+    Each input row is one JSON-encoded Crossref work. The API handle
+    (``self.api``), ``lookup_issnl`` and the ``process_source`` /
+    ``process_batch`` drivers are inherited from ``FatcatImporter``.
+    """
+
+    # TODO: overload __init__ to handle create_containers
+
+    def parse_crossref_dict(self, obj):
+        """
+        obj is a python dict (parsed from json).
+        returns a (WorkEntity, ReleaseEntity) tuple, or None when the
+        record has no 'author' key or no usable 'title' and is skipped.
+        """
+
+        # Records without authors or a title are skipped instead of
+        # raising KeyError/IndexError further down.
+        if 'author' not in obj or not obj.get('title'):
+            return None
+
+        # contribs
+        contribs = []
+        for i, am in enumerate(obj['author']):
+            # Join whichever of given/family is present so that a single
+            # incomplete author entry cannot abort the whole import.
+            name_parts = [am.get('given'), am.get('family')]
+            contribs.append(fatcat_client.ReleaseContrib(
+                creator_id=None, # TODO: orcid lookup
+                index=i,
+                # Sorry humans :(
+                raw=" ".join(p for p in name_parts if p),
+                role="author"))
+
+        # container
+        # TODO: ISSN vs. ISSN-L
+        issns = obj.get('ISSN') or [None]
+        issn = issns[0]
+        # lookup_issnl() asserts on the ISSN format, so only call it when
+        # the record actually carries an ISSN.
+        container_id = self.lookup_issnl(issn) if issn else None
+
+        ## TODO: create containers in-line like this?
+        #container = dict(
+        #    issn=issn,
+        #    name=obj['container-title'][0],
+        #    container=container_id,
+        #    #sortname=obj['short-container-title'][0])
+        #    publisher=obj['publisher'])
+        #if container_id is None and self.create_containers and issn != None:
+        #    rv = self.post('/v0/container', data=dict(
+        #        issn=container['issn'],
+        #        publisher=container['publisher']))
+        #    assert rv.status_code == 201
+        #    container_id = rv.json()['id']
+        #    print("created container: {}".format(issn))
+        #    container['id'] = container_id
+        #    self._issn_map[issn] = container_id
+
+        # references
+        refs = []
+        for i, rm in enumerate(obj.get('reference', [])):
+            refs.append(fatcat_client.ReleaseRef(
+                index=i,
+                target_release_id=None, # TODO: DOI lookup: rm.get("DOI", None),
+                # TODO: all these
+                key=None,
+                year=None,
+                container_title=None,
+                title=None,
+                locator=None,
+                # TODO: how to generate a proper stub here from k/v objdata?
+                # TODO: just dump JSON here if we didn't get a match?
+                # str() keeps the join from failing on non-string values.
+                raw="| ".join(str(v) for v in rm.values())))
+
+        # work
+        work = fatcat_client.WorkEntity(
+            work_type=obj['type'],
+        )
+
+        # release
+        extra = dict(crossref={
+            'links': obj.get('link', []),
+            'subject': obj.get('subject'),
+            'crossref-type': obj['type'],
+            'alternative-id': obj.get('alternative-id', [])})
+
+        release = fatcat_client.ReleaseEntity(
+            # XXX: placeholder; create_row() overwrites it with the ident
+            # of the work it has just created.
+            work_id='null',
+            title=obj['title'][0],
+            contribs=contribs,
+            refs=refs,
+            container_id=container_id,
+            release_type=obj['type'],
+            doi=obj['DOI'],
+            release_date=obj['created']['date-time'],
+            #license=obj.get('license', [dict(URL=None)])[0]['URL'] or None,
+            issue=obj.get('issue'),
+            volume=obj.get('volume'),
+            pages=obj.get('page'),
+            extra=extra)
+        return (work, release)
+
+    def create_row(self, row, editgroup_id=None):
+        """Parse one JSON row and create its work, then its release.
+
+        None rows (the padding grouper() adds to the last batch) and rows
+        rejected by parse_crossref_dict() are skipped without API calls.
+        """
+        if row is None:
+            return
+        obj = json.loads(row)
+        both = self.parse_crossref_dict(obj)
+        if both is None:
+            return
+        (work, release) = both
+        work.editgroup_id = editgroup_id
+        release.editgroup_id = editgroup_id
+        # The work must exist first: the release refers to its ident.
+        created = self.api.create_work(work)
+        release.work_id = created.ident
+        self.api.create_release(release)
+
+    def create_batch(self, batch, editgroup_id=None):
+        """Current work/release pairing disallows batch creation of releases.
+        Could do batch work creation and then match against releases, but meh."""
+        for row in batch:
+            self.create_row(row, editgroup_id)
diff --git a/python/fatcat/importer_common.py b/python/fatcat/importer_common.py
new file mode 100644
index 00000000..98bfb26e
--- /dev/null
+++ b/python/fatcat/importer_common.py
@@ -0,0 +1,53 @@
+
+import sys
+import json
+import itertools
+import fatcat_client
+from fatcat_client.rest import ApiException
+
+# from: https://docs.python.org/3/library/itertools.html
+def grouper(iterable, n, fillvalue=None):
+    """Split iterable into consecutive n-tuples, padding the last one.
+
+    Returns a lazy iterator; missing slots of the final tuple are filled
+    with fillvalue.
+    """
+    shared = iter(iterable)
+    # n references to the *same* iterator: each output tuple therefore
+    # consumes n consecutive items.
+    slots = (shared,) * n
+    return itertools.zip_longest(*slots, fillvalue=fillvalue)
+
+class FatcatImporter:
+    """Shared base for importers that push rows into the fatcat API.
+
+    Subclasses implement ``create_row`` (one row, driven by
+    ``process_source``) and ``create_batch`` (a tuple of rows, driven by
+    ``process_batch``).
+    """
+
+    def __init__(self, host_url):
+        """Build the API client for host_url and an empty ISSN-L cache."""
+        conf = fatcat_client.Configuration()
+        conf.host = host_url
+        self.api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf))
+        # issnl -> container ident, or None for a cached "not found"
+        self._issnl_map = dict()
+
+    def create_row(self, row, editgroup_id=None):
+        """Create the entity for a single row; implemented by subclasses."""
+        raise NotImplementedError
+
+    def create_batch(self, batch, editgroup_id=None):
+        """Create the entities for a batch of rows; implemented by subclasses.
+
+        The last batch handed over by process_batch() is padded with None.
+        """
+        raise NotImplementedError
+
+    def process_source(self, source, group_size=100):
+        """Creates and auto-accepts an editgroup every group_size rows.
+
+        An editgroup is only opened once there is a row to put into it:
+        an empty source makes no API calls, and no empty editgroup is
+        left unaccepted when the row count is a multiple of group_size.
+        """
+        eg = None
+        pending = 0
+        for row in source:
+            if eg is None:
+                eg = self.api.create_editgroup(
+                    fatcat_client.Editgroup(editor_id=1))
+            self.create_row(row, editgroup_id=eg.id)
+            pending += 1
+            if pending >= group_size:
+                self.api.accept_editgroup(eg.id)
+                eg = None
+                pending = 0
+        # Accept the trailing, partially filled editgroup (if any).
+        if eg is not None:
+            self.api.accept_editgroup(eg.id)
+
+    def process_batch(self, source, size=50):
+        """Reads and processes in batches (not API-call-per-row).
+
+        One editgroup is created and accepted per batch of `size` rows.
+        """
+        for rows in grouper(source, size):
+            eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id=1))
+            self.create_batch(rows, eg.id)
+            self.api.accept_editgroup(eg.id)
+
+    def lookup_issnl(self, issnl):
+        """Caches calls to the ISSN-L lookup API endpoint in a local dict.
+
+        Returns the container ident, or None when the API answers 404
+        (that miss is cached too). Any other ApiException is re-raised.
+        """
+        assert len(issnl) == 9 and issnl[4] == '-'
+        if issnl in self._issnl_map:
+            return self._issnl_map[issnl]
+        container_id = None
+        try:
+            rv = self.api.lookup_container(issnl=issnl)
+            container_id = rv.ident
+        except ApiException as ae:
+            # If anything other than a 404 (not found), something is wrong
+            if ae.status != 404:
+                raise
+        self._issnl_map[issnl] = container_id # might be None
+        return container_id
diff --git a/python/fatcat/orcid_importer.py b/python/fatcat/orcid_importer.py
index ba8d0bd7..fb4716df 100644
--- a/python/fatcat/orcid_importer.py
+++ b/python/fatcat/orcid_importer.py
@@ -3,6 +3,8 @@ import sys
import json
import itertools
import fatcat_client
+from fatcat.importer_common import FatcatImporter
+
def value_or_none(e):
if type(e) == dict:
@@ -11,18 +13,7 @@ def value_or_none(e):
e = None
return e
-# from: https://docs.python.org/3/library/itertools.html
-def grouper(iterable, n, fillvalue=None):
- "Collect data into fixed-length chunks or blocks"
- args = [iter(iterable)] * n
- return itertools.zip_longest(*args, fillvalue=fillvalue)
-
-class FatcatOrcidImporter:
-
- def __init__(self, host_url):
- conf = fatcat_client.Configuration()
- conf.host = host_url
- self.api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf))
+class FatcatOrcidImporter(FatcatImporter):
def parse_orcid_dict(self, obj):
"""
@@ -47,34 +38,18 @@ class FatcatOrcidImporter:
extra=extra)
return ce
- def process_line(self, line, editgroup_id=None):
- """Doesn't accept the editgroup"""
- obj = json.loads(line)
+ def create_row(self, row, editgroup_id=None):
+ obj = json.loads(row)
ce = self.parse_orcid_dict(obj)
if ce is not None:
ce.editgroup_id = editgroup_id
self.api.create_creator(ce)
- def process_source(self, source, group_size=100):
- """Creates and auto-accepts editgropu every group_size lines"""
- eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id=1))
- for i, line in enumerate(source):
- self.process_line(line, editgroup_id=eg.id)
- if i > 0 and (i % group_size) == 0:
- self.api.accept_editgroup(eg)
- eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id=1))
- if i == 0 or (i % group_size) != 0:
- self.api.accept_editgroup(eg.id)
-
- def process_batch(self, source, size=50):
+ def create_batch(self, batch, editgroup_id=None):
"""Reads and processes in batches (not API-call-per-line)"""
- for lines in grouper(source, size):
- objects = [self.parse_orcid_dict(json.loads(l))
- for l in lines if l != None]
- objects = [o for o in objects if o != None]
- eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id=1))
- for o in objects:
- o.editgroup_id = eg.id
- self.api.create_creator_batch(objects)
- self.api.accept_editgroup(eg.id)
- print("inserted {}".format(len(objects)))
+ objects = [self.parse_orcid_dict(json.loads(l))
+ for l in batch if l != None]
+ objects = [o for o in objects if o != None]
+ for o in objects:
+ o.editgroup_id = editgroup_id
+ self.api.create_creator_batch(objects)
diff --git a/python/fatcat/raw_api_client.py b/python/fatcat/raw_api_client.py
new file mode 100644
index 00000000..75151ebb
--- /dev/null
+++ b/python/fatcat/raw_api_client.py
@@ -0,0 +1,66 @@
+
+import sys
+import json
+import requests
+
+
+class RawFatcatApiClient:
+    """Thin requests-based client that talks JSON to the fatcat HTTP API."""
+
+    def __init__(self, host_url):
+        """Store host_url (prefixed to every request path) and open a session."""
+        self.host_url = host_url
+        self.session = requests.Session()
+        self._issn_map = dict()
+
+    def get(self, path, data=None):
+        """GET host_url + path, sending data as the JSON body; returns the response."""
+        headers = {"content-type": "application/json"}
+        return self.session.get(self.host_url + path, json=data,
+            headers=headers)
+
+    def post(self, path, data=None):
+        """POST data as JSON to host_url + path; returns the response."""
+        headers = {"content-type": "application/json"}
+        return self.session.post(self.host_url + path, json=data,
+            headers=headers)
+
+    def new_editgroup(self):
+        """Create an editgroup for editor 1 and return its id.
+
+        Prints the response and its JSON body, and asserts on a 201 status.
+        """
+        rv = self.post('/v0/editgroup', data=dict(
+            editor_id=1))
+        print(rv)
+        print(rv.json())
+        assert rv.status_code == 201
+        editgroup_id = rv.json()['id']
+        return editgroup_id
+
+    def accept_editgroup(self, eg):
+        """Accept editgroup id eg; asserts on a 200 status and returns the response."""
+        rv = self.post('/v0/editgroup/{}/accept'.format(eg))
+        assert rv.status_code == 200
+        return rv
+
+    def import_issn_file(self, json_file, create_containers=False, batchsize=100):
+        """Feed each JSON line of json_file to self.import_crossref_dict().
+
+        Lines lacking 'author' or 'title' are skipped; an editgroup is
+        accepted and replaced every batchsize lines. Errors raised for a
+        single line are printed and the import continues.
+
+        Raises NotImplementedError when no import_crossref_dict() method
+        is available: this class does not define one, and without this
+        check every line would fail inside the broad except below while
+        empty editgroups were still created and accepted.
+        """
+        if not hasattr(self, 'import_crossref_dict'):
+            raise NotImplementedError(
+                "import_issn_file() requires an import_crossref_dict() method")
+        eg = self.new_editgroup()
+        i = 0
+        with open(json_file, 'r') as file:
+            for line in file:
+                # progress output: a line number per batch, a dot every 20 lines
+                if i % batchsize == 0:
+                    sys.stdout.write('\n{}: '.format(i))
+                if (i+1) % 20 == 0:
+                    sys.stdout.write('.')
+                i = i + 1
+                obj = json.loads(line)
+                if not ("author" in obj and "title" in obj):
+                    continue
+                try:
+                    self.import_crossref_dict(obj, editgroup=eg,
+                        create_containers=create_containers)
+                except Exception as e:
+                    # best-effort import: report the failure and carry on
+                    print("ERROR: {}".format(e))
+                if i % batchsize == 0:
+                    self.accept_editgroup(eg)
+                    eg = self.new_editgroup()
+        if i % batchsize != 0:
+            self.accept_editgroup(eg)
+        print("done!")
+
+    def health(self):
+        """GET /health, assert on a 200 status and return the decoded JSON body."""
+        rv = self.get("/health")
+        assert rv.status_code == 200
+        return rv.json()
diff --git a/python/tests/api_client.py b/python/tests/api_client.py
deleted file mode 100644
index 9d2ec302..00000000
--- a/python/tests/api_client.py
+++ /dev/null
@@ -1,15 +0,0 @@
-
-import pytest
-import fatcat.api_client
-from fixtures import *
-
-
-# TODO:
-#def test_client_health(api_client):
-# assert api_client.health() != None
-
-
-def test_import_crossref(api_client):
- api_client.import_crossref_file('tests/files/crossref-works.2018-01-21.badsample.json')
-
- # TODO: use API to check that entities actually created...
diff --git a/python/tests/crossref.py b/python/tests/crossref.py
new file mode 100644
index 00000000..e72f7d02
--- /dev/null
+++ b/python/tests/crossref.py
@@ -0,0 +1,16 @@
+
+import pytest
+from fatcat.crossref_importer import FatcatCrossrefImporter
+
+
+@pytest.fixture(scope="function")
+def crossref_importer():
+    """Provide a FatcatCrossrefImporter pointed at http://localhost:9411/v0."""
+    # NOTE(review): presumably needs a fatcat API server listening on that
+    # address for the tests using this fixture to pass — confirm.
+    yield FatcatCrossrefImporter("http://localhost:9411/v0")
+
+def test_crossref_importer_batch(crossref_importer):
+    """Smoke test: process_batch() over the sample file must not raise."""
+    sample_path = 'tests/files/crossref-works.2018-01-21.badsample.json'
+    with open(sample_path, 'r') as sample:
+        crossref_importer.process_batch(sample)
+
+def test_crossref_importer(crossref_importer):
+    """Smoke test: process_source() over the sample file must not raise."""
+    sample_path = 'tests/files/crossref-works.2018-01-21.badsample.json'
+    with open(sample_path, 'r') as sample:
+        crossref_importer.process_source(sample)
diff --git a/python/tests/fixtures.py b/python/tests/fixtures.py
index b1682e79..c9da9253 100644
--- a/python/tests/fixtures.py
+++ b/python/tests/fixtures.py
@@ -19,23 +19,12 @@ def app(full_app):
@pytest.fixture(scope="function")
-def api_client(full_app):
-
- # TODO:
- #pid = os.fork()
- #if pid == 0:
- # full_app.testing = False
- # full_app.run(host="localhost", port=8444, debug=False)
- # os._exit(0)
- #
- #time.sleep(0.2)
- #yield fatcat.api_client.FatCatApiClient("http://localhost:8444")
- #os.kill(pid, signal.SIGKILL)
-
- yield fatcat.api_client.FatCatApiClient("http://localhost:9411")
+def raw_api_client():
+ yield fatcat.raw_api_client.RawFatcatApiClient("http://localhost:9411")
## Helpers ##################################################################
+# TODO: what are these even here for?
def check_entity_fields(e):
for key in ('rev', 'is_live', 'redirect_id'):
diff --git a/python/tests/orcid.py b/python/tests/orcid.py
index 86a23603..d0e99cfc 100644
--- a/python/tests/orcid.py
+++ b/python/tests/orcid.py
@@ -2,10 +2,12 @@
import pytest
from fatcat.orcid_importer import FatcatOrcidImporter
+
@pytest.fixture(scope="function")
def orcid_importer():
yield FatcatOrcidImporter("http://localhost:9411/v0")
+# TODO: use API to check that entities actually created...
def test_orcid_importer_batch(orcid_importer):
with open('tests/files/0000-0001-8254-7103.json', 'r') as f:
orcid_importer.process_batch(f)