diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2018-06-20 17:54:54 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2018-06-20 17:54:54 -0700 |
commit | 43e74c2e81c64d6d4f4e644cc5a6f75945ff660d (patch) | |
tree | 769cae1607f3b8b9fff43fce99028bda571c2145 /python/fatcat/crossref_importer.py | |
parent | 381fe70c56b1a936d4eef676ee8ba546f6a3cf30 (diff) | |
download | fatcat-43e74c2e81c64d6d4f4e644cc5a6f75945ff660d.tar.gz fatcat-43e74c2e81c64d6d4f4e644cc5a6f75945ff660d.zip |
more progress on crossref+orcid importers
Diffstat (limited to 'python/fatcat/crossref_importer.py')
-rw-r--r-- | python/fatcat/crossref_importer.py | 85 |
1 file changed, 45 insertions, 40 deletions
```diff
diff --git a/python/fatcat/crossref_importer.py b/python/fatcat/crossref_importer.py
index 4c68230d..a7166bc3 100644
--- a/python/fatcat/crossref_importer.py
+++ b/python/fatcat/crossref_importer.py
@@ -8,7 +8,9 @@ from fatcat.importer_common import FatcatImporter
 
 class FatcatCrossrefImporter(FatcatImporter):
 
-    # TODO: overload __init__ to handle create_containers
+    def __init__(self, host_url, issn_map_file, create_containers=True):
+        super().__init__(host_url, issn_map_file)
+        self.create_containers = create_containers
 
     def parse_crossref_dict(self, obj):
         """
@@ -19,50 +21,49 @@ class FatcatCrossrefImporter(FatcatImporter):
         # contribs
         contribs = []
         for i, am in enumerate(obj['author']):
+            creator_id = None
+            if 'ORCID' in am.keys():
+                creator_id = self.lookup_orcid(am['ORCID'].split('/')[-1])
             contribs.append(fatcat_client.ReleaseContrib(
-                creator_id=None, # TODO: orcid lookup
-                index=i,
+                creator_id=creator_id,
+                index=i+1, # Sorry humans :(
                 raw="{} {}".format(am['given'], am['family']),
                 role="author"))
 
         # container
-        # TODO: ISSN vs. ISSN-L
         issn = obj.get('ISSN', [None])[0]
-        container_id = self.lookup_issnl(issn)
+        issnl = self.issn2issnl(issn)
+        container_id = None
+        if issnl:
+            container_id = self.lookup_issnl(issnl)
+        publisher = obj['publisher']
 
-        ## TODO: create containers in-line like this?
-        #container = dict(
-        #    issn=issn,
-        #    name=obj['container-title'][0],
-        #    container=container_id,
-        #    #sortname=obj['short-container-title'][0])
-        #    publisher=obj['publisher'])
-        #if container_id is None and self.create_containers and issn != None:
-        #    rv = self.post('/v0/container', data=dict(
-        #        issn=container['issn'],
-        #        publisher=container['publisher']))
-        #    assert rv.status_code == 201
-        #    container_id = rv.json()['id']
-        #    print("created container: {}".format(issn))
-        #    container['id'] = container_id
-        #    self._issn_map[issn] = container_id
+        ce = None
+        if container_id is None and self.create_containers and issnl != None:
+            ce = fatcat_client.ContainerEntity(
+                issnl=issnl,
+                publisher=publisher,
+                name=obj['container-title'][0])
+            print("created container: {}".format(issnl))
 
 
         # references
         refs = []
         for i, rm in enumerate(obj.get('reference', [])):
+            try:
+                year = int(rm.get('year'))
+            except:
+                year = None
             refs.append(fatcat_client.ReleaseRef(
-                index=i,
+                index=i+1,
                 target_release_id=None, # TODO: DOI lookup: rm.get("DOI", None),
-                # TODO: all these
-                key=None,
-                year=None,
-                container_title=None,
-                title=None,
-                locator=None,
-                # TODO: how to generate a proper stub here from k/v objdata?
-                # TODO: just dump JSON here if we didn't get a match?
-                raw="| ".join(rm.values())))
+                # unreliable for crossref: key=rm['key'].split('|')[-1],
+                year=year,
+                container_title=rm.get('volume-title'),
+                title=rm.get('title'),
+                locator=rm.get('first-page'),
+                # TODO: just dump JSON somewhere here?
+                raw=rm.get('unstructured')))
 
         # work
         we = fatcat_client.WorkEntity(
@@ -73,34 +74,38 @@ class FatcatCrossrefImporter(FatcatImporter):
         extra = dict(crossref={
             'links': obj.get('link', []),
             'subject': obj.get('subject'),
-            'crossref-type': obj['type'],
+            'type': obj['type'],
+            'license': obj.get('license', [dict(URL=None)])[0]['URL'] or None,
             'alternative-id': obj.get('alternative-id', [])})
 
         re = fatcat_client.ReleaseEntity(
-            work_id='null', # XXX:
+            work_id='tbd', # gets set later, I promise!
             title=obj['title'][0],
             contribs=contribs,
             refs=refs,
             container_id=container_id,
             release_type=obj['type'],
-            doi=obj['DOI'],
+            doi=obj['DOI'].lower(),
             release_date=obj['created']['date-time'],
-            #license=obj.get('license', [dict(URL=None)])[0]['URL'] or None,
             issue=obj.get('issue'),
             volume=obj.get('volume'),
             pages=obj.get('page'),
             extra=extra)
-        return (we, re)
+        return (we, re, ce)
 
     def create_row(self, row, editgroup_id=None):
         if row is None:
-            continue
+            return
         obj = json.loads(row)
-        both = self.parse_crossref_dict(obj)
-        if both is not None:
-            (we, re) = both
+        entities = self.parse_crossref_dict(obj)
+        if entities is not None:
+            (we, re, ce) = entities
             we.editgroup_id = editgroup_id
             re.editgroup_id = editgroup_id
+            if ce is not None:
+                ce.editgroup_id = editgroup_id
+                container = self.api.create_container(ce)
+                re.container_id = container.ident
             created = self.api.create_work(we)
             re.work_id = created.ident
             self.api.create_release(re)
```