author    | Bryan Newbold <bnewbold@robocracy.org> | 2018-06-20 09:37:37 -0700
committer | Bryan Newbold <bnewbold@robocracy.org> | 2018-06-20 09:37:37 -0700
commit    | bde5c8f14e13afe4d54e9bfafd8bda8b0f33f804 (patch)
tree      | 67d7039b1621bebdafd89539602c2b5d05332501
parent    | 698399c49edcefe33c012856b604985925969a77 (diff)
download  | fatcat-bde5c8f14e13afe4d54e9bfafd8bda8b0f33f804.tar.gz
          | fatcat-bde5c8f14e13afe4d54e9bfafd8bda8b0f33f804.zip
python: refactor importer code (+crossref)
-rwxr-xr-x | python/client.py                   |  19
-rw-r--r-- | python/fatcat/api_client.py        | 177
-rw-r--r-- | python/fatcat/crossref_importer.py | 112
-rw-r--r-- | python/fatcat/importer_common.py   |  53
-rw-r--r-- | python/fatcat/orcid_importer.py    |  49
-rw-r--r-- | python/fatcat/raw_api_client.py    |  66
-rw-r--r-- | python/tests/api_client.py         |  15
-rw-r--r-- | python/tests/crossref.py           |  16
-rw-r--r-- | python/tests/fixtures.py           |  17
-rw-r--r-- | python/tests/orcid.py              |   2

10 files changed, 274 insertions(+), 252 deletions(-)
diff --git a/python/client.py b/python/client.py
index 14814512..9631318a 100755
--- a/python/client.py
+++ b/python/client.py
@@ -2,21 +2,21 @@
 
 import sys
 import argparse
-from fatcat.api_client import FatCatApiClient
+from fatcat.raw_api_client import RawFatcatApiClient
 from fatcat.orcid_importer import FatcatOrcidImporter
 
 def run_import_crossref(args):
-    fcc = FatCatApiClient(args.host_url)
-    fcc.import_crossref_file(args.json_file,
-        create_containers=args.create_containers)
+    fcc = FatcatCrossrefClient(args.host_url)
+    fcc.import_crossref_file(args.json_file)
+        # create_containers=args.create_containers
 
 def run_import_orcid(args):
     foi = FatcatOrcidImporter(args.host_url)
     foi.process_batch(args.json_file, size=args.batch_size)
 
 def health(args):
-    fcc = FatCatApiClient(args.host_url)
-    print(fcc.health())
+    rfac = RawFatcatApiClient(args.host_url)
+    print(rfac.health())
 
 def main():
     parser = argparse.ArgumentParser()
@@ -32,9 +32,10 @@ def main():
     sub_import_crossref.set_defaults(func=run_import_crossref)
     sub_import_crossref.add_argument('json_file',
         help="crossref JSON file to import from")
-    sub_import_crossref.add_argument('--create-containers',
-        action='store_true',
-        help="if true, create containers based on ISSN")
+    # TODO:
+    #sub_import_crossref.add_argument('--create-containers',
+    #    action='store_true',
+    #    help="if true, create containers based on ISSN")
 
     sub_import_orcid = subparsers.add_parser('import-orcid')
     sub_import_orcid.set_defaults(func=run_import_orcid)
diff --git a/python/fatcat/api_client.py b/python/fatcat/api_client.py
deleted file mode 100644
index 4c000609..00000000
--- a/python/fatcat/api_client.py
+++ /dev/null
@@ -1,177 +0,0 @@
-
-import sys
-import json
-import requests
-
-
-class FatCatApiClient:
-
-    def __init__(self, host_url):
-        self.host_url = host_url
-        self.session = requests.Session()
-        self._issn_map = dict()
-
-    def get(self, path, data=None):
-        headers = {"content-type": "application/json"}
-        return self.session.get(self.host_url + path, json=data,
-            headers=headers)
-
-    def post(self, path, data=None):
-        headers = {"content-type": "application/json"}
-        return self.session.post(self.host_url + path, json=data,
-            headers=headers)
-
-    def new_editgroup(self):
-        rv = self.post('/v0/editgroup', data=dict(
-            editor_id=1))
-        print(rv)
-        print(rv.json())
-        assert rv.status_code == 201
-        editgroup_id = rv.json()['id']
-        return editgroup_id
-
-    def accept_editgroup(self, eg):
-        rv = self.post('/v0/editgroup/{}/accept'.format(eg))
-        assert rv.status_code == 200
-        return rv
-
-    def lookup_issn(self, issn):
-        assert len(issn) == 9 and issn[4] == '-'
-        if issn in self._issn_map:
-            return self._issn_map[issn]
-        rv = self.get('/v0/container/lookup', data=dict(issn=issn))
-        container_id = None
-        if rv.status_code == 200:
-            container_id = rv.json()['id']
-        else:
-            # only other valid response is a 404; otherwise we had an error
-            assert rv.status_code == 404
-        self._issn_map[issn] = container_id
-        return container_id
-
-    def import_crossref_file(self, json_file, create_containers=False, batchsize=100):
-        eg = self.new_editgroup()
-        i = 0
-        with open(json_file, 'r') as file:
-            for line in file:
-                if i % batchsize == 0:
-                    sys.stdout.write('\n{}: '.format(i))
-                if (i+1) % 20 == 0:
-                    sys.stdout.write('.')
-                i = i + 1
-                obj = json.loads(line)
-                if not ("author" in obj and "title" in obj):
-                    continue
-                try:
-                    self.import_crossref_dict(obj, editgroup=eg,
-                        create_containers=create_containers)
-                except Exception as e:
-                    print("ERROR: {}".format(e))
-                if i % batchsize == 0:
-                    self.accept_editgroup(eg)
-                    eg = self.new_editgroup()
-        if i % batchsize != 0:
-            self.accept_editgroup(eg)
-        print("done!")
-
-    def import_crossref_dict(self, meta, editgroup=None,
-            create_containers=False):
-
-        # creators
-        creators = []
-        for am in meta['author']:
-            c = dict(name="{} {}".format(am['given'], am['family']),
-                sortname="{}, {}".format(am['family'], am['given']),
-                orcid=None)
-            creators.append(c)
-
-        # container
-        issn = meta.get('ISSN', [None])[0]
-        container_id = self.lookup_issn(issn)
-        container = dict(
-            issn=issn,
-            name=meta['container-title'][0],
-            container=container_id,
-            #sortname=meta['short-container-title'][0])
-            publisher=meta['publisher'])
-
-        if container_id is None and create_containers and issn != None:
-            rv = self.post('/v0/container', data=dict(
-                issn=container['issn'],
-                publisher=container['publisher']))
-            assert rv.status_code == 201
-            container_id = rv.json()['id']
-            print("created container: {}".format(issn))
-            container['id'] = container_id
-            self._issn_map[issn] = container_id
-
-        # references
-        refs = []
-        for i, rm in enumerate(meta.get('reference', [])):
-            ref = dict(
-                doi=rm.get("DOI", None),
-                index=i+1,
-                # TODO: how to generate a proper stub here from k/v metadata?
-                stub="| ".join(rm.values()))
-            refs.append(ref)
-
-        # work and release
-        title = meta['title'][0]
-        rv = self.post('/v0/work',
-            data=dict(title=title, editgroup=editgroup)) #work_type="book"
-        assert rv.status_code == 201
-        work_id = rv.json()['id']
-
-        extra = dict(crossref={
-            'links': meta.get('link', []),
-            'subject': meta.get('subject'),
-            'type': meta['type'],
-            'alternative-id': meta.get('alternative-id', [])})
-
-        rv = self.post('/v0/release', data=dict(
-            title=title,
-            work=work_id,
-            # XXX: creators=creators,
-            # XXX: refs=refs,
-            # XXX: container=container_id,
-            release_type=meta['type'],
-            doi=meta['DOI'],
-            date=meta['created']['date-time'],
-            license=meta.get('license', [dict(URL=None)])[0]['URL'] or None,
-            issue=meta.get('issue', None),
-            volume=meta.get('volume', None),
-            pages=meta.get('page', None),
-            editgroup=editgroup,
-            extra=extra))
-        assert rv.status_code == 201
-        release_id = rv.json()['id']
-
-    def import_issn_file(self, json_file, create_containers=False, batchsize=100):
-        eg = self.new_editgroup()
-        i = 0
-        with open(json_file, 'r') as file:
-            for line in file:
-                if i % batchsize == 0:
-                    sys.stdout.write('\n{}: '.format(i))
-                if (i+1) % 20 == 0:
-                    sys.stdout.write('.')
-                i = i + 1
-                obj = json.loads(line)
-                if not ("author" in obj and "title" in obj):
-                    continue
-                try:
-                    self.import_crossref_dict(obj, editgroup=eg,
-                        create_containers=create_containers)
-                except Exception as e:
-                    print("ERROR: {}".format(e))
-                if i % batchsize == 0:
-                    self.accept_editgroup(eg)
-                    eg = self.new_editgroup()
-        if i % batchsize != 0:
-            self.accept_editgroup(eg)
-        print("done!")
-
-    def health(self):
-        rv = self.get("/health")
-        assert rv.status_code == 200
-        return rv.json()
diff --git a/python/fatcat/crossref_importer.py b/python/fatcat/crossref_importer.py
new file mode 100644
index 00000000..4c68230d
--- /dev/null
+++ b/python/fatcat/crossref_importer.py
@@ -0,0 +1,112 @@
+
+import sys
+import json
+import itertools
+import fatcat_client
+from fatcat.importer_common import FatcatImporter
+
+
+class FatcatCrossrefImporter(FatcatImporter):
+
+    # TODO: overload __init__ to handle create_containers
+
+    def parse_crossref_dict(self, obj):
+        """
+        obj is a python dict (parsed from json).
+        returns a ReleaseEntity
+        """
+
+        # contribs
+        contribs = []
+        for i, am in enumerate(obj['author']):
+            contribs.append(fatcat_client.ReleaseContrib(
+                creator_id=None, # TODO: orcid lookup
+                index=i,
+                # Sorry humans :(
+                raw="{} {}".format(am['given'], am['family']),
+                role="author"))
+
+        # container
+        # TODO: ISSN vs. ISSN-L
+        issn = obj.get('ISSN', [None])[0]
+        container_id = self.lookup_issnl(issn)
+
+        ## TODO: create containers in-line like this?
+        #container = dict(
+        #    issn=issn,
+        #    name=obj['container-title'][0],
+        #    container=container_id,
+        #    #sortname=obj['short-container-title'][0])
+        #    publisher=obj['publisher'])
+        #if container_id is None and self.create_containers and issn != None:
+        #    rv = self.post('/v0/container', data=dict(
+        #        issn=container['issn'],
+        #        publisher=container['publisher']))
+        #    assert rv.status_code == 201
+        #    container_id = rv.json()['id']
+        #    print("created container: {}".format(issn))
+        #    container['id'] = container_id
+        #    self._issn_map[issn] = container_id
+
+        # references
+        refs = []
+        for i, rm in enumerate(obj.get('reference', [])):
+            refs.append(fatcat_client.ReleaseRef(
+                index=i,
+                target_release_id=None, # TODO: DOI lookup: rm.get("DOI", None),
+                # TODO: all these
+                key=None,
+                year=None,
+                container_title=None,
+                title=None,
+                locator=None,
+                # TODO: how to generate a proper stub here from k/v objdata?
+                # TODO: just dump JSON here if we didn't get a match?
+                raw="| ".join(rm.values())))
+
+        # work
+        we = fatcat_client.WorkEntity(
+            work_type=obj['type'],
+        )
+
+        # release
+        extra = dict(crossref={
+            'links': obj.get('link', []),
+            'subject': obj.get('subject'),
+            'crossref-type': obj['type'],
+            'alternative-id': obj.get('alternative-id', [])})
+
+        re = fatcat_client.ReleaseEntity(
+            work_id='null', # XXX:
+            title=obj['title'][0],
+            contribs=contribs,
+            refs=refs,
+            container_id=container_id,
+            release_type=obj['type'],
+            doi=obj['DOI'],
+            release_date=obj['created']['date-time'],
+            #license=obj.get('license', [dict(URL=None)])[0]['URL'] or None,
+            issue=obj.get('issue'),
+            volume=obj.get('volume'),
+            pages=obj.get('page'),
+            extra=extra)
+        return (we, re)
+
+    def create_row(self, row, editgroup_id=None):
+        if row is None:
+            continue
+        obj = json.loads(row)
+        both = self.parse_crossref_dict(obj)
+        if both is not None:
+            (we, re) = both
+            we.editgroup_id = editgroup_id
+            re.editgroup_id = editgroup_id
+            created = self.api.create_work(we)
+            re.work_id = created.ident
+            self.api.create_release(re)
+
+    def create_batch(self, batch, editgroup_id=None):
+        """Current work/release pairing disallows batch creation of releases.
+        Could do batch work creation and then match against releases, but meh."""
+        for row in batch:
+            self.create_row(row, editgroup_id)
diff --git a/python/fatcat/importer_common.py b/python/fatcat/importer_common.py
new file mode 100644
index 00000000..98bfb26e
--- /dev/null
+++ b/python/fatcat/importer_common.py
@@ -0,0 +1,53 @@
+
+import sys
+import json
+import itertools
+import fatcat_client
+from fatcat_client.rest import ApiException
+
+# from: https://docs.python.org/3/library/itertools.html
+def grouper(iterable, n, fillvalue=None):
+    "Collect data into fixed-length chunks or blocks"
+    args = [iter(iterable)] * n
+    return itertools.zip_longest(*args, fillvalue=fillvalue)
+
+class FatcatImporter:
+
+    def __init__(self, host_url):
+        conf = fatcat_client.Configuration()
+        conf.host = host_url
+        self.api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf))
+        self._issnl_map = dict()
+
+    def process_source(self, source, group_size=100):
+        """Creates and auto-accepts editgropu every group_size rows"""
+        eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id=1))
+        for i, row in enumerate(source):
+            self.create_row(row, editgroup_id=eg.id)
+            if i > 0 and (i % group_size) == 0:
+                self.api.accept_editgroup(eg)
+                eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id=1))
+        if i == 0 or (i % group_size) != 0:
+            self.api.accept_editgroup(eg.id)
+
+    def process_batch(self, source, size=50):
+        """Reads and processes in batches (not API-call-per-)"""
+        for rows in grouper(source, size):
+            eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id=1))
+            self.create_batch(rows, eg.id)
+            self.api.accept_editgroup(eg.id)
+
+    def lookup_issnl(self, issnl):
+        """Caches calls to the ISSN-L lookup API endpoint in a local dict"""
+        assert len(issnl) == 9 and issnl[4] == '-'
+        if issnl in self._issnl_map:
+            return self._issnl_map[issn]
+        container_id = None
+        try:
+            rv = self.api.lookup_container(issnl=issnl)
+            container_id = rv.ident
+        except ApiException as ae:
+            # If anything other than a 404 (not found), something is wrong
+            assert ae.status == 404
+        self._issnl_map[issnl] = container_id # might be None
+        return container_id
diff --git a/python/fatcat/orcid_importer.py b/python/fatcat/orcid_importer.py
index ba8d0bd7..fb4716df 100644
--- a/python/fatcat/orcid_importer.py
+++ b/python/fatcat/orcid_importer.py
@@ -3,6 +3,8 @@ import sys
 import json
 import itertools
 import fatcat_client
+from fatcat.importer_common import FatcatImporter
+
 
 def value_or_none(e):
     if type(e) == dict:
@@ -11,18 +13,7 @@ def value_or_none(e):
         e = None
     return e
 
-# from: https://docs.python.org/3/library/itertools.html
-def grouper(iterable, n, fillvalue=None):
-    "Collect data into fixed-length chunks or blocks"
-    args = [iter(iterable)] * n
-    return itertools.zip_longest(*args, fillvalue=fillvalue)
-
-class FatcatOrcidImporter:
-
-    def __init__(self, host_url):
-        conf = fatcat_client.Configuration()
-        conf.host = host_url
-        self.api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf))
+class FatcatOrcidImporter(FatcatImporter):
 
     def parse_orcid_dict(self, obj):
         """
@@ -47,34 +38,18 @@ class FatcatOrcidImporter:
             extra=extra)
         return ce
 
-    def process_line(self, line, editgroup_id=None):
-        """Doesn't accept the editgroup"""
-        obj = json.loads(line)
+    def create_row(self, row, editgroup_id=None):
+        obj = json.loads(row)
         ce = self.parse_orcid_dict(obj)
         if ce is not None:
             ce.editgroup_id = editgroup_id
             self.api.create_creator(ce)
 
-    def process_source(self, source, group_size=100):
-        """Creates and auto-accepts editgropu every group_size lines"""
-        eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id=1))
-        for i, line in enumerate(source):
-            self.process_line(line, editgroup_id=eg.id)
-            if i > 0 and (i % group_size) == 0:
-                self.api.accept_editgroup(eg)
-                eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id=1))
-        if i == 0 or (i % group_size) != 0:
-            self.api.accept_editgroup(eg.id)
-
-    def process_batch(self, source, size=50):
+    def create_batch(self, batch, editgroup_id=None):
         """Reads and processes in batches (not API-call-per-line)"""
-        for lines in grouper(source, size):
-            objects = [self.parse_orcid_dict(json.loads(l))
-                for l in lines if l != None]
-            objects = [o for o in objects if o != None]
-            eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id=1))
-            for o in objects:
-                o.editgroup_id = eg.id
-            self.api.create_creator_batch(objects)
-            self.api.accept_editgroup(eg.id)
-            print("inserted {}".format(len(objects)))
+        objects = [self.parse_orcid_dict(json.loads(l))
+            for l in batch if l != None]
+        objects = [o for o in objects if o != None]
+        for o in objects:
+            o.editgroup_id = editgroup_id
+        self.api.create_creator_batch(objects)
diff --git a/python/fatcat/raw_api_client.py b/python/fatcat/raw_api_client.py
new file mode 100644
index 00000000..75151ebb
--- /dev/null
+++ b/python/fatcat/raw_api_client.py
@@ -0,0 +1,66 @@
+
+import sys
+import json
+import requests
+
+
+class RawFatcatApiClient:
+
+    def __init__(self, host_url):
+        self.host_url = host_url
+        self.session = requests.Session()
+        self._issn_map = dict()
+
+    def get(self, path, data=None):
+        headers = {"content-type": "application/json"}
+        return self.session.get(self.host_url + path, json=data,
+            headers=headers)
+
+    def post(self, path, data=None):
+        headers = {"content-type": "application/json"}
+        return self.session.post(self.host_url + path, json=data,
+            headers=headers)
+
+    def new_editgroup(self):
+        rv = self.post('/v0/editgroup', data=dict(
+            editor_id=1))
+        print(rv)
+        print(rv.json())
+        assert rv.status_code == 201
+        editgroup_id = rv.json()['id']
+        return editgroup_id
+
+    def accept_editgroup(self, eg):
+        rv = self.post('/v0/editgroup/{}/accept'.format(eg))
+        assert rv.status_code == 200
+        return rv
+
+    def import_issn_file(self, json_file, create_containers=False, batchsize=100):
+        eg = self.new_editgroup()
+        i = 0
+        with open(json_file, 'r') as file:
+            for line in file:
+                if i % batchsize == 0:
+                    sys.stdout.write('\n{}: '.format(i))
+                if (i+1) % 20 == 0:
+                    sys.stdout.write('.')
+                i = i + 1
+                obj = json.loads(line)
+                if not ("author" in obj and "title" in obj):
+                    continue
+                try:
+                    self.import_crossref_dict(obj, editgroup=eg,
+                        create_containers=create_containers)
+                except Exception as e:
+                    print("ERROR: {}".format(e))
+                if i % batchsize == 0:
+                    self.accept_editgroup(eg)
+                    eg = self.new_editgroup()
+        if i % batchsize != 0:
+            self.accept_editgroup(eg)
+        print("done!")
+
+    def health(self):
+        rv = self.get("/health")
+        assert rv.status_code == 200
+        return rv.json()
diff --git a/python/tests/api_client.py b/python/tests/api_client.py
deleted file mode 100644
index 9d2ec302..00000000
--- a/python/tests/api_client.py
+++ /dev/null
@@ -1,15 +0,0 @@
-
-import pytest
-import fatcat.api_client
-from fixtures import *
-
-
-# TODO:
-#def test_client_health(api_client):
-#    assert api_client.health() != None
-
-
-def test_import_crossref(api_client):
-    api_client.import_crossref_file('tests/files/crossref-works.2018-01-21.badsample.json')
-
-    # TODO: use API to check that entities actually created...
diff --git a/python/tests/crossref.py b/python/tests/crossref.py
new file mode 100644
index 00000000..e72f7d02
--- /dev/null
+++ b/python/tests/crossref.py
@@ -0,0 +1,16 @@
+
+import pytest
+from fatcat.crossref_importer import FatcatCrossrefImporter
+
+
+@pytest.fixture(scope="function")
+def crossref_importer():
+    yield FatcatCrossrefImporter("http://localhost:9411/v0")
+
+def test_crossref_importer_batch(crossref_importer):
+    with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f:
+        crossref_importer.process_batch(f)
+
+def test_crossref_importer(crossref_importer):
+    with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f:
+        crossref_importer.process_source(f)
diff --git a/python/tests/fixtures.py b/python/tests/fixtures.py
index b1682e79..c9da9253 100644
--- a/python/tests/fixtures.py
+++ b/python/tests/fixtures.py
@@ -19,23 +19,12 @@ def app(full_app):
 
 
 @pytest.fixture(scope="function")
-def api_client(full_app):
-
-    # TODO:
-    #pid = os.fork()
-    #if pid == 0:
-    #    full_app.testing = False
-    #    full_app.run(host="localhost", port=8444, debug=False)
-    #    os._exit(0)
-    #
-    #time.sleep(0.2)
-    #yield fatcat.api_client.FatCatApiClient("http://localhost:8444")
-    #os.kill(pid, signal.SIGKILL)
-
-    yield fatcat.api_client.FatCatApiClient("http://localhost:9411")
+def raw_api_client():
+    yield fatcat.raw_api_client.RawFatcatApiClient("http://localhost:9411")
 
 
 ## Helpers ##################################################################
 
+# TODO: what are these even here for?
 def check_entity_fields(e):
     for key in ('rev', 'is_live', 'redirect_id'):
diff --git a/python/tests/orcid.py b/python/tests/orcid.py
index 86a23603..d0e99cfc 100644
--- a/python/tests/orcid.py
+++ b/python/tests/orcid.py
@@ -2,10 +2,12 @@
 import pytest
 from fatcat.orcid_importer import FatcatOrcidImporter
 
+
 @pytest.fixture(scope="function")
 def orcid_importer():
     yield FatcatOrcidImporter("http://localhost:9411/v0")
 
+# TODO: use API to check that entities actually created...
 def test_orcid_importer_batch(orcid_importer):
     with open('tests/files/0000-0001-8254-7103.json', 'r') as f:
         orcid_importer.process_batch(f)
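
For context on how the refactored importers are meant to be driven, here is a minimal usage sketch assembled from client.py and the test fixtures above. It assumes a fatcat API server listening on localhost:9411 (as the tests do) and JSON-lines input files like those under tests/files/; it is a sketch, not part of the commit.

```python
# Minimal usage sketch based on client.py and the tests in this commit.
from fatcat.orcid_importer import FatcatOrcidImporter
from fatcat.crossref_importer import FatcatCrossrefImporter

orcid = FatcatOrcidImporter("http://localhost:9411/v0")
crossref = FatcatCrossrefImporter("http://localhost:9411/v0")

# Batched mode: one editgroup and one batch-create API call per group of rows.
with open('tests/files/0000-0001-8254-7103.json', 'r') as f:
    orcid.process_batch(f, size=50)

# Row-at-a-time mode: one create call per row, with an editgroup created and
# auto-accepted every group_size rows.
with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f:
    crossref.process_source(f, group_size=100)
```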
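The batch mode rests on the `grouper` recipe that importer_common.py borrows from the itertools documentation. One subtlety worth seeing in isolation: `zip_longest` pads the final chunk with the fill value, which is why the orcid importer's `create_batch` filters `l != None` before calling `json.loads`. A self-contained illustration:

```python
import itertools

# The grouper recipe exactly as used in importer_common.py above.
def grouper(iterable, n, fillvalue=None):
    "Collect data into fixed-length chunks or blocks"
    args = [iter(iterable)] * n
    return itertools.zip_longest(*args, fillvalue=fillvalue)

rows = ['{"id": 1}', '{"id": 2}', '{"id": 3}', '{"id": 4}', '{"id": 5}']
for chunk in grouper(rows, 2):
    # The last chunk is padded with None up to the group size, so any
    # consumer must drop fill values before parsing each row as JSON.
    print([r for r in chunk if r is not None])
# -> ['{"id": 1}', '{"id": 2}']
# -> ['{"id": 3}', '{"id": 4}']
# -> ['{"id": 5}']
```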
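The new `lookup_issnl` helper memoizes both successful lookups and 404 misses in `self._issnl_map`. Note that the cache-hit branch in the diff reads `self._issnl_map[issn]` while the parameter is named `issnl`, so a cache hit would raise a NameError as committed. A sketch of the intended pattern, written as a standalone function over an API client and a cache dict (both assumed):

```python
from fatcat_client.rest import ApiException

def lookup_issnl(api, issnl_map, issnl):
    """Memoized ISSN-L -> container ident lookup; hits and misses both cached."""
    assert len(issnl) == 9 and issnl[4] == '-'
    if issnl in issnl_map:
        return issnl_map[issnl]  # the diff spells this `[issn]`, a NameError
    container_id = None
    try:
        container_id = api.lookup_container(issnl=issnl).ident
    except ApiException as ae:
        # a 404 just means "not found"; anything else is a real error
        assert ae.status == 404
    issnl_map[issnl] = container_id  # a miss is cached as None
    return container_id
```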
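Finally, the two-step create in `FatcatCrossrefImporter.create_row` is what rules out batch release creation (as its `create_batch` docstring notes): the work must exist before the release can point at it. A condensed sketch of that method body, with one caveat: the `if row is None: continue` guard in the diff sits outside any loop, which is a SyntaxError, so `return` is presumably intended.

```python
def create_row(self, row, editgroup_id=None):
    if row is None:
        return  # `continue` in the diff; invalid outside a loop
    obj = json.loads(row)
    both = self.parse_crossref_dict(obj)
    if both is None:
        return
    we, re = both
    we.editgroup_id = editgroup_id
    re.editgroup_id = editgroup_id
    # Create the work first, then point the new release at its ident.
    created = self.api.create_work(we)
    re.work_id = created.ident
    self.api.create_release(re)
```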