diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2018-06-20 17:54:54 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2018-06-20 17:54:54 -0700 |
commit | 43e74c2e81c64d6d4f4e644cc5a6f75945ff660d (patch) | |
tree | 769cae1607f3b8b9fff43fce99028bda571c2145 /python/fatcat | |
parent | 381fe70c56b1a936d4eef676ee8ba546f6a3cf30 (diff) | |
download | fatcat-43e74c2e81c64d6d4f4e644cc5a6f75945ff660d.tar.gz fatcat-43e74c2e81c64d6d4f4e644cc5a6f75945ff660d.zip |
more progress on crossref+orcid importers
Diffstat (limited to 'python/fatcat')
-rw-r--r-- | python/fatcat/crossref_importer.py | 85 | ||||
-rw-r--r-- | python/fatcat/importer_common.py | 44 |
2 files changed, 84 insertions, 45 deletions
diff --git a/python/fatcat/crossref_importer.py b/python/fatcat/crossref_importer.py index 4c68230d..a7166bc3 100644 --- a/python/fatcat/crossref_importer.py +++ b/python/fatcat/crossref_importer.py @@ -8,7 +8,9 @@ from fatcat.importer_common import FatcatImporter class FatcatCrossrefImporter(FatcatImporter): - # TODO: overload __init__ to handle create_containers + def __init__(self, host_url, issn_map_file, create_containers=True): + super().__init__(host_url, issn_map_file) + self.create_containers = create_containers def parse_crossref_dict(self, obj): """ @@ -19,50 +21,49 @@ class FatcatCrossrefImporter(FatcatImporter): # contribs contribs = [] for i, am in enumerate(obj['author']): + creator_id = None + if 'ORCID' in am.keys(): + creator_id = self.lookup_orcid(am['ORCID'].split('/')[-1]) contribs.append(fatcat_client.ReleaseContrib( - creator_id=None, # TODO: orcid lookup - index=i, + creator_id=creator_id, + index=i+1, # Sorry humans :( raw="{} {}".format(am['given'], am['family']), role="author")) # container - # TODO: ISSN vs. ISSN-L issn = obj.get('ISSN', [None])[0] - container_id = self.lookup_issnl(issn) + issnl = self.issn2issnl(issn) + container_id = None + if issnl: + container_id = self.lookup_issnl(issnl) + publisher = obj['publisher'] - ## TODO: create containers in-line like this? - #container = dict( - # issn=issn, - # name=obj['container-title'][0], - # container=container_id, - # #sortname=obj['short-container-title'][0]) - # publisher=obj['publisher']) - #if container_id is None and self.create_containers and issn != None: - # rv = self.post('/v0/container', data=dict( - # issn=container['issn'], - # publisher=container['publisher'])) - # assert rv.status_code == 201 - # container_id = rv.json()['id'] - # print("created container: {}".format(issn)) - # container['id'] = container_id - # self._issn_map[issn] = container_id + ce = None + if container_id is None and self.create_containers and issnl != None: + ce = fatcat_client.ContainerEntity( + issnl=issnl, + publisher=publisher, + name=obj['container-title'][0]) + print("created container: {}".format(issnl)) # references refs = [] for i, rm in enumerate(obj.get('reference', [])): + try: + year = int(rm.get('year')) + except: + year = None refs.append(fatcat_client.ReleaseRef( - index=i, + index=i+1, target_release_id=None, # TODO: DOI lookup: rm.get("DOI", None), - # TODO: all these - key=None, - year=None, - container_title=None, - title=None, - locator=None, - # TODO: how to generate a proper stub here from k/v objdata? - # TODO: just dump JSON here if we didn't get a match? - raw="| ".join(rm.values()))) + # unreliable for crossref: key=rm['key'].split('|')[-1], + year=year, + container_title=rm.get('volume-title'), + title=rm.get('title'), + locator=rm.get('first-page'), + # TODO: just dump JSON somewhere here? + raw=rm.get('unstructured'))) # work we = fatcat_client.WorkEntity( @@ -73,34 +74,38 @@ class FatcatCrossrefImporter(FatcatImporter): extra = dict(crossref={ 'links': obj.get('link', []), 'subject': obj.get('subject'), - 'crossref-type': obj['type'], + 'type': obj['type'], + 'license': obj.get('license', [dict(URL=None)])[0]['URL'] or None, 'alternative-id': obj.get('alternative-id', [])}) re = fatcat_client.ReleaseEntity( - work_id='null', # XXX: + work_id='tbd', # gets set later, I promise! title=obj['title'][0], contribs=contribs, refs=refs, container_id=container_id, release_type=obj['type'], - doi=obj['DOI'], + doi=obj['DOI'].lower(), release_date=obj['created']['date-time'], - #license=obj.get('license', [dict(URL=None)])[0]['URL'] or None, issue=obj.get('issue'), volume=obj.get('volume'), pages=obj.get('page'), extra=extra) - return (we, re) + return (we, re, ce) def create_row(self, row, editgroup_id=None): if row is None: - continue + return obj = json.loads(row) - both = self.parse_crossref_dict(obj) - if both is not None: - (we, re) = both + entities = self.parse_crossref_dict(obj) + if entities is not None: + (we, re, ce) = entities we.editgroup_id = editgroup_id re.editgroup_id = editgroup_id + if ce is not None: + ce.editgroup_id = editgroup_id + container = self.api.create_container(ce) + re.container_id = container.ident created = self.api.create_work(we) re.work_id = created.ident self.api.create_release(re) diff --git a/python/fatcat/importer_common.py b/python/fatcat/importer_common.py index 98bfb26e..c24565b4 100644 --- a/python/fatcat/importer_common.py +++ b/python/fatcat/importer_common.py @@ -13,11 +13,15 @@ def grouper(iterable, n, fillvalue=None): class FatcatImporter: - def __init__(self, host_url): + def __init__(self, host_url, issn_map_file=None): conf = fatcat_client.Configuration() conf.host = host_url self.api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf)) - self._issnl_map = dict() + self._issnl_id_map = dict() + self._orcid_id_map = dict() + self._issn_issnl_map = None + if issn_map_file: + self.read_issn_map_file(issn_map_file) def process_source(self, source, group_size=100): """Creates and auto-accepts editgropu every group_size rows""" @@ -40,8 +44,8 @@ class FatcatImporter: def lookup_issnl(self, issnl): """Caches calls to the ISSN-L lookup API endpoint in a local dict""" assert len(issnl) == 9 and issnl[4] == '-' - if issnl in self._issnl_map: - return self._issnl_map[issn] + if issnl in self._issnl_id_map: + return self._issnl_id_map[issn] container_id = None try: rv = self.api.lookup_container(issnl=issnl) @@ -49,5 +53,35 @@ class FatcatImporter: except ApiException as ae: # If anything other than a 404 (not found), something is wrong assert ae.status == 404 - self._issnl_map[issnl] = container_id # might be None + self._issnl_id_map[issnl] = container_id # might be None return container_id + + def lookup_orcid(self, orcid): + """Caches calls to the Orcid lookup API endpoint in a local dict""" + assert len(orcid) == 19 and orcid[4] == '-' + if orcid in self._orcid_id_map: + return self._orcid_id_map[orcid] + creator_id = None + try: + rv = self.api.lookup_creator(orcid=orcid) + creator_id = rv.ident + except ApiException as ae: + # If anything other than a 404 (not found), something is wrong + assert ae.status == 404 + self._orcid_id_map[orcid] = creator_id # might be None + return creator_id + + def read_issn_map_file(self, issn_map_file): + self._issn_issnl_map = dict() + for line in issn_map_file: + if line.startswith("ISSN") or len(line) == 0: + continue + (issn, issnl) = line.split()[0:2] + self._issn_issnl_map[issn] = issnl + # double mapping makes lookups easy + self._issn_issnl_map[issnl] = issnl + + def issn2issnl(self, issn): + if issn is None: + return None + self._issn_issnl_map.get(issn) |