Diffstat (limited to 'python/fatcat')
-rw-r--r--  python/fatcat/api_client.py         177
-rw-r--r--  python/fatcat/crossref_importer.py  112
-rw-r--r--  python/fatcat/importer_common.py     53
-rw-r--r--  python/fatcat/orcid_importer.py      49
-rw-r--r--  python/fatcat/raw_api_client.py      66
5 files changed, 243 insertions, 214 deletions
diff --git a/python/fatcat/api_client.py b/python/fatcat/api_client.py
deleted file mode 100644
index 4c000609..00000000
--- a/python/fatcat/api_client.py
+++ /dev/null
@@ -1,177 +0,0 @@
-
-import sys
-import json
-import requests
-
-
-class FatCatApiClient:
-
-    def __init__(self, host_url):
-        self.host_url = host_url
-        self.session = requests.Session()
-        self._issn_map = dict()
-
-    def get(self, path, data=None):
-        headers = {"content-type": "application/json"}
-        return self.session.get(self.host_url + path, json=data,
-            headers=headers)
-
-    def post(self, path, data=None):
-        headers = {"content-type": "application/json"}
-        return self.session.post(self.host_url + path, json=data,
-            headers=headers)
-
-    def new_editgroup(self):
-        rv = self.post('/v0/editgroup', data=dict(
-            editor_id=1))
-        print(rv)
-        print(rv.json())
-        assert rv.status_code == 201
-        editgroup_id = rv.json()['id']
-        return editgroup_id
-
-    def accept_editgroup(self, eg):
-        rv = self.post('/v0/editgroup/{}/accept'.format(eg))
-        assert rv.status_code == 200
-        return rv
-
-    def lookup_issn(self, issn):
-        assert len(issn) == 9 and issn[4] == '-'
-        if issn in self._issn_map:
-            return self._issn_map[issn]
-        rv = self.get('/v0/container/lookup', data=dict(issn=issn))
-        container_id = None
-        if rv.status_code == 200:
-            container_id = rv.json()['id']
-        else:
-            # only other valid response is a 404; otherwise we had an error
-            assert rv.status_code == 404
-        self._issn_map[issn] = container_id
-        return container_id
-
-    def import_crossref_file(self, json_file, create_containers=False, batchsize=100):
-        eg = self.new_editgroup()
-        i = 0
-        with open(json_file, 'r') as file:
-            for line in file:
-                if i % batchsize == 0:
-                    sys.stdout.write('\n{}: '.format(i))
-                if (i+1) % 20 == 0:
-                    sys.stdout.write('.')
-                i = i + 1
-                obj = json.loads(line)
-                if not ("author" in obj and "title" in obj):
-                    continue
-                try:
-                    self.import_crossref_dict(obj, editgroup=eg,
-                        create_containers=create_containers)
-                except Exception as e:
-                    print("ERROR: {}".format(e))
-                if i % batchsize == 0:
-                    self.accept_editgroup(eg)
-                    eg = self.new_editgroup()
-        if i % batchsize != 0:
-            self.accept_editgroup(eg)
-        print("done!")
-
-    def import_crossref_dict(self, meta, editgroup=None,
-            create_containers=False):
-
-        # creators
-        creators = []
-        for am in meta['author']:
-            c = dict(name="{} {}".format(am['given'], am['family']),
-                sortname="{}, {}".format(am['family'], am['given']),
-                orcid=None)
-            creators.append(c)
-
-        # container
-        issn = meta.get('ISSN', [None])[0]
-        container_id = self.lookup_issn(issn)
-        container = dict(
-            issn=issn,
-            name=meta['container-title'][0],
-            container=container_id,
-            #sortname=meta['short-container-title'][0])
-            publisher=meta['publisher'])
-
-        if container_id is None and create_containers and issn != None:
-            rv = self.post('/v0/container', data=dict(
-                issn=container['issn'],
-                publisher=container['publisher']))
-            assert rv.status_code == 201
-            container_id = rv.json()['id']
-            print("created container: {}".format(issn))
-            container['id'] = container_id
-            self._issn_map[issn] = container_id
-
-        # references
-        refs = []
-        for i, rm in enumerate(meta.get('reference', [])):
-            ref = dict(
-                doi=rm.get("DOI", None),
-                index=i+1,
-                # TODO: how to generate a proper stub here from k/v metadata?
-                stub="| ".join(rm.values()))
-            refs.append(ref)
-
-        # work and release
-        title = meta['title'][0]
-        rv = self.post('/v0/work',
-            data=dict(title=title, editgroup=editgroup)) #work_type="book"
-        assert rv.status_code == 201
-        work_id = rv.json()['id']
-
-        extra = dict(crossref={
-            'links': meta.get('link', []),
-            'subject': meta.get('subject'),
-            'type': meta['type'],
-            'alternative-id': meta.get('alternative-id', [])})
-
-        rv = self.post('/v0/release', data=dict(
-            title=title,
-            work=work_id,
-            # XXX: creators=creators,
-            # XXX: refs=refs,
-            # XXX: container=container_id,
-            release_type=meta['type'],
-            doi=meta['DOI'],
-            date=meta['created']['date-time'],
-            license=meta.get('license', [dict(URL=None)])[0]['URL'] or None,
-            issue=meta.get('issue', None),
-            volume=meta.get('volume', None),
-            pages=meta.get('page', None),
-            editgroup=editgroup,
-            extra=extra))
-        assert rv.status_code == 201
-        release_id = rv.json()['id']
-
-    def import_issn_file(self, json_file, create_containers=False, batchsize=100):
-        eg = self.new_editgroup()
-        i = 0
-        with open(json_file, 'r') as file:
-            for line in file:
-                if i % batchsize == 0:
-                    sys.stdout.write('\n{}: '.format(i))
-                if (i+1) % 20 == 0:
-                    sys.stdout.write('.')
-                i = i + 1
-                obj = json.loads(line)
-                if not ("author" in obj and "title" in obj):
-                    continue
-                try:
-                    self.import_crossref_dict(obj, editgroup=eg,
-                        create_containers=create_containers)
-                except Exception as e:
-                    print("ERROR: {}".format(e))
-                if i % batchsize == 0:
-                    self.accept_editgroup(eg)
-                    eg = self.new_editgroup()
-        if i % batchsize != 0:
-            self.accept_editgroup(eg)
-        print("done!")
-
-    def health(self):
-        rv = self.get("/health")
-        assert rv.status_code == 200
-        return rv.json()
diff --git a/python/fatcat/crossref_importer.py b/python/fatcat/crossref_importer.py
new file mode 100644
index 00000000..4c68230d
--- /dev/null
+++ b/python/fatcat/crossref_importer.py
@@ -0,0 +1,112 @@
+
+import sys
+import json
+import itertools
+import fatcat_client
+from fatcat.importer_common import FatcatImporter
+
+
+class FatcatCrossrefImporter(FatcatImporter):
+
+    # TODO: overload __init__ to handle create_containers
+
+    def parse_crossref_dict(self, obj):
+        """
+        obj is a python dict (parsed from json).
+        returns a (WorkEntity, ReleaseEntity) pair
+        """
+
+        # contribs
+        contribs = []
+        for i, am in enumerate(obj['author']):
+            contribs.append(fatcat_client.ReleaseContrib(
+                creator_id=None, # TODO: orcid lookup
+                index=i,
+                # Sorry humans :(
+                raw="{} {}".format(am['given'], am['family']),
+                role="author"))
+
+        # container
+        # TODO: ISSN vs. ISSN-L
+        issn = obj.get('ISSN', [None])[0]
+        container_id = self.lookup_issnl(issn)
+
+        ## TODO: create containers in-line like this?
+        #container = dict(
+        #    issn=issn,
+        #    name=obj['container-title'][0],
+        #    container=container_id,
+        #    #sortname=obj['short-container-title'][0])
+        #    publisher=obj['publisher'])
+        #if container_id is None and self.create_containers and issn != None:
+        #    rv = self.post('/v0/container', data=dict(
+        #        issn=container['issn'],
+        #        publisher=container['publisher']))
+        #    assert rv.status_code == 201
+        #    container_id = rv.json()['id']
+        #    print("created container: {}".format(issn))
+        #    container['id'] = container_id
+        #    self._issn_map[issn] = container_id
+
+        # references
+        refs = []
+        for i, rm in enumerate(obj.get('reference', [])):
+            refs.append(fatcat_client.ReleaseRef(
+                index=i,
+                target_release_id=None, # TODO: DOI lookup: rm.get("DOI", None),
+                # TODO: all these
+                key=None,
+                year=None,
+                container_title=None,
+                title=None,
+                locator=None,
+                # TODO: how to generate a proper stub here from k/v objdata?
+                # TODO: just dump JSON here if we didn't get a match?
+                raw="| ".join(rm.values())))
+
+        # work
+        we = fatcat_client.WorkEntity(
+            work_type=obj['type'],
+        )
+
+        # release
+        extra = dict(crossref={
+            'links': obj.get('link', []),
+            'subject': obj.get('subject'),
+            'crossref-type': obj['type'],
+            'alternative-id': obj.get('alternative-id', [])})
+
+        re = fatcat_client.ReleaseEntity(
+            work_id='null', # XXX:
+            title=obj['title'][0],
+            contribs=contribs,
+            refs=refs,
+            container_id=container_id,
+            release_type=obj['type'],
+            doi=obj['DOI'],
+            release_date=obj['created']['date-time'],
+            #license=obj.get('license', [dict(URL=None)])[0]['URL'] or None,
+            issue=obj.get('issue'),
+            volume=obj.get('volume'),
+            pages=obj.get('page'),
+            extra=extra)
+        return (we, re)
+
+    def create_row(self, row, editgroup_id=None):
+        if row is None:
+            return
+        obj = json.loads(row)
+        both = self.parse_crossref_dict(obj)
+        if both is not None:
+            (we, re) = both
+            we.editgroup_id = editgroup_id
+            re.editgroup_id = editgroup_id
+            created = self.api.create_work(we)
+            re.work_id = created.ident
+            self.api.create_release(re)
+
+    def create_batch(self, batch, editgroup_id=None):
+        """Current work/release pairing disallows batch creation of releases.
+        Could do batch work creation and then match against releases, but meh."""
+        for row in batch:
+            self.create_row(row, editgroup_id)
diff --git a/python/fatcat/importer_common.py b/python/fatcat/importer_common.py
new file mode 100644
index 00000000..98bfb26e
--- /dev/null
+++ b/python/fatcat/importer_common.py
@@ -0,0 +1,53 @@
+
+import sys
+import json
+import itertools
+import fatcat_client
+from fatcat_client.rest import ApiException
+
+# from: https://docs.python.org/3/library/itertools.html
+def grouper(iterable, n, fillvalue=None):
+    "Collect data into fixed-length chunks or blocks"
+    args = [iter(iterable)] * n
+    return itertools.zip_longest(*args, fillvalue=fillvalue)
+
+class FatcatImporter:
+
+    def __init__(self, host_url):
+        conf = fatcat_client.Configuration()
+        conf.host = host_url
+        self.api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf))
+        self._issnl_map = dict()
+
+    def process_source(self, source, group_size=100):
+        """Creates and auto-accepts an editgroup every group_size rows"""
+        eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id=1))
+        for i, row in enumerate(source):
+            self.create_row(row, editgroup_id=eg.id)
+            if i > 0 and (i % group_size) == 0:
+                self.api.accept_editgroup(eg.id)
+                eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id=1))
+        if i == 0 or (i % group_size) != 0:
+            self.api.accept_editgroup(eg.id)
+
+    def process_batch(self, source, size=50):
+        """Reads and processes in batches (not API-call-per-row)"""
+        for rows in grouper(source, size):
+            eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id=1))
+            self.create_batch(rows, eg.id)
+            self.api.accept_editgroup(eg.id)
+
+    def lookup_issnl(self, issnl):
+        """Caches calls to the ISSN-L lookup API endpoint in a local dict"""
+        assert len(issnl) == 9 and issnl[4] == '-'
+        if issnl in self._issnl_map:
+            return self._issnl_map[issnl]
+        container_id = None
+        try:
+            rv = self.api.lookup_container(issnl=issnl)
+            container_id = rv.ident
+        except ApiException as ae:
+            # If anything other than a 404 (not found), something is wrong
+            assert ae.status == 404
+        self._issnl_map[issnl] = container_id # might be None
+        return container_id
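A note on the batch path: grouper() pads the final chunk with the fillvalue (None), which is why create_row() above returns early on None rows and the ORCID create_batch() below filters with `l != None`. A runnable standalone sketch of the recipe, with made-up sample rows:

    import itertools

    def grouper(iterable, n, fillvalue=None):
        "Collect data into fixed-length chunks or blocks"
        args = [iter(iterable)] * n
        return itertools.zip_longest(*args, fillvalue=fillvalue)

    for chunk in grouper(["r1", "r2", "r3", "r4", "r5"], 3):
        print(chunk)
    # ('r1', 'r2', 'r3')
    # ('r4', 'r5', None)   <- trailing None padding, skipped downstream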
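With the FatcatImporter base class in place, the new Crossref importer can be driven like this. A minimal sketch only: the host URL and dump file name are illustrative assumptions, not part of this commit.

    from fatcat.crossref_importer import FatcatCrossrefImporter

    # hypothetical fatcat API endpoint
    importer = FatcatCrossrefImporter("http://localhost:9411")

    # each JSON row becomes one WorkEntity plus one ReleaseEntity;
    # an editgroup is created and accepted every 100 rows
    with open("crossref_dump.json", "r") as f:
        importer.process_source(f, group_size=100)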
diff --git a/python/fatcat/orcid_importer.py b/python/fatcat/orcid_importer.py
index ba8d0bd7..fb4716df 100644
--- a/python/fatcat/orcid_importer.py
+++ b/python/fatcat/orcid_importer.py
@@ -3,6 +3,8 @@ import sys
 import json
 import itertools
 import fatcat_client
+from fatcat.importer_common import FatcatImporter
+
 
 def value_or_none(e):
     if type(e) == dict:
@@ -11,18 +13,7 @@ def value_or_none(e):
         e = None
     return e
 
-# from: https://docs.python.org/3/library/itertools.html
-def grouper(iterable, n, fillvalue=None):
-    "Collect data into fixed-length chunks or blocks"
-    args = [iter(iterable)] * n
-    return itertools.zip_longest(*args, fillvalue=fillvalue)
-
-class FatcatOrcidImporter:
-
-    def __init__(self, host_url):
-        conf = fatcat_client.Configuration()
-        conf.host = host_url
-        self.api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf))
+class FatcatOrcidImporter(FatcatImporter):
 
     def parse_orcid_dict(self, obj):
         """
@@ -47,34 +38,18 @@ class FatcatOrcidImporter:
             extra=extra)
         return ce
 
-    def process_line(self, line, editgroup_id=None):
-        """Doesn't accept the editgroup"""
-        obj = json.loads(line)
+    def create_row(self, row, editgroup_id=None):
+        obj = json.loads(row)
         ce = self.parse_orcid_dict(obj)
         if ce is not None:
             ce.editgroup_id = editgroup_id
             self.api.create_creator(ce)
 
-    def process_source(self, source, group_size=100):
-        """Creates and auto-accepts editgropu every group_size lines"""
-        eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id=1))
-        for i, line in enumerate(source):
-            self.process_line(line, editgroup_id=eg.id)
-            if i > 0 and (i % group_size) == 0:
-                self.api.accept_editgroup(eg)
-                eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id=1))
-        if i == 0 or (i % group_size) != 0:
-            self.api.accept_editgroup(eg.id)
-
-    def process_batch(self, source, size=50):
+    def create_batch(self, batch, editgroup_id=None):
         """Reads and processes in batches (not API-call-per-line)"""
-        for lines in grouper(source, size):
-            objects = [self.parse_orcid_dict(json.loads(l))
-                for l in lines if l != None]
-            objects = [o for o in objects if o != None]
-            eg = self.api.create_editgroup(fatcat_client.Editgroup(editor_id=1))
-            for o in objects:
-                o.editgroup_id = eg.id
-            self.api.create_creator_batch(objects)
-            self.api.accept_editgroup(eg.id)
-            print("inserted {}".format(len(objects)))
+        objects = [self.parse_orcid_dict(json.loads(l))
+            for l in batch if l != None]
+        objects = [o for o in objects if o != None]
+        for o in objects:
+            o.editgroup_id = editgroup_id
+        self.api.create_creator_batch(objects)
diff --git a/python/fatcat/raw_api_client.py b/python/fatcat/raw_api_client.py
new file mode 100644
index 00000000..75151ebb
--- /dev/null
+++ b/python/fatcat/raw_api_client.py
@@ -0,0 +1,66 @@
+
+import sys
+import json
+import requests
+
+
+class RawFatcatApiClient:
+
+    def __init__(self, host_url):
+        self.host_url = host_url
+        self.session = requests.Session()
+        self._issn_map = dict()
+
+    def get(self, path, data=None):
+        headers = {"content-type": "application/json"}
+        return self.session.get(self.host_url + path, json=data,
+            headers=headers)
+
+    def post(self, path, data=None):
+        headers = {"content-type": "application/json"}
+        return self.session.post(self.host_url + path, json=data,
+            headers=headers)
+
+    def new_editgroup(self):
+        rv = self.post('/v0/editgroup', data=dict(
+            editor_id=1))
+        print(rv)
+        print(rv.json())
+        assert rv.status_code == 201
+        editgroup_id = rv.json()['id']
+        return editgroup_id
+
+    def accept_editgroup(self, eg):
+        rv = self.post('/v0/editgroup/{}/accept'.format(eg))
+        assert rv.status_code == 200
+        return rv
+
+    def import_issn_file(self, json_file, create_containers=False, batchsize=100):
+        eg = self.new_editgroup()
+        i = 0
+        with open(json_file, 'r') as file:
+            for line in file:
+                if i % batchsize == 0:
+                    sys.stdout.write('\n{}: '.format(i))
+                if (i+1) % 20 == 0:
+                    sys.stdout.write('.')
+                i = i + 1
+                obj = json.loads(line)
+                if not ("author" in obj and "title" in obj):
+                    continue
+                try:
+                    self.import_crossref_dict(obj, editgroup=eg,
+                        create_containers=create_containers)
+                except Exception as e:
+                    print("ERROR: {}".format(e))
+                if i % batchsize == 0:
+                    self.accept_editgroup(eg)
+                    eg = self.new_editgroup()
+        if i % batchsize != 0:
+            self.accept_editgroup(eg)
+        print("done!")
+
+    def health(self):
+        rv = self.get("/health")
+        assert rv.status_code == 200
+        return rv.json()
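The refactored ORCID importer exposes the same two inherited entry points. A usage sketch; the host URL and dump file name are assumptions for illustration:

    from fatcat.orcid_importer import FatcatOrcidImporter

    importer = FatcatOrcidImporter("http://localhost:9411")  # hypothetical URL

    # row-at-a-time: one create_creator() call per record
    with open("orcid_dump.json", "r") as f:
        importer.process_source(f, group_size=100)

    # batched: records are grouped and sent via create_creator_batch()
    with open("orcid_dump.json", "r") as f:
        importer.process_batch(f, size=50)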
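The surviving raw client still speaks plain HTTP via requests. A minimal sketch of the methods defined above, again assuming a hypothetical localhost endpoint:

    from fatcat.raw_api_client import RawFatcatApiClient

    client = RawFatcatApiClient("http://localhost:9411")
    print(client.health())           # GET /health, asserts HTTP 200
    eg_id = client.new_editgroup()   # POST /v0/editgroup, returns the new id
    client.accept_editgroup(eg_id)   # POST /v0/editgroup/{id}/accept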