diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2018-06-20 09:37:37 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2018-06-20 09:37:37 -0700 |
commit | bde5c8f14e13afe4d54e9bfafd8bda8b0f33f804 (patch) | |
tree | 67d7039b1621bebdafd89539602c2b5d05332501 /python/fatcat/crossref_importer.py | |
parent | 698399c49edcefe33c012856b604985925969a77 (diff) | |
download | fatcat-bde5c8f14e13afe4d54e9bfafd8bda8b0f33f804.tar.gz fatcat-bde5c8f14e13afe4d54e9bfafd8bda8b0f33f804.zip |
python: refactor importer code (+crossref)
Diffstat (limited to 'python/fatcat/crossref_importer.py')
-rw-r--r-- | python/fatcat/crossref_importer.py | 112 |
1 files changed, 112 insertions, 0 deletions
# Source: python/fatcat/crossref_importer.py (new file, blob 4c68230d),
# reconstructed from the flattened diff view of this commit.

import sys
import json
import itertools
import fatcat_client
from fatcat.importer_common import FatcatImporter


class FatcatCrossrefImporter(FatcatImporter):
    """Importer that converts Crossref work records into fatcat entities.

    Each input row is a JSON-encoded Crossref record. A row yields one
    WorkEntity and one ReleaseEntity, which are created through ``self.api``.
    ``self.api`` and ``self.lookup_issnl`` are not defined in this class;
    presumably they come from FatcatImporter -- confirm against
    fatcat/importer_common.py.
    """

    # TODO: overload __init__ to handle create_containers

    def _parse_contribs(self, obj):
        """Return the list of ReleaseContrib built from ``obj['author']``.

        Records without an 'author' key yield an empty list instead of
        raising KeyError.
        """
        contribs = []
        for i, am in enumerate(obj.get('author') or []):
            # Join whichever personal-name parts are present; fall back to
            # 'name' when neither is (assumed to be how organizational
            # authors are represented -- TODO confirm against Crossref docs).
            name_parts = [am[k] for k in ('given', 'family') if am.get(k)]
            raw_name = " ".join(name_parts) or am.get('name')
            contribs.append(fatcat_client.ReleaseContrib(
                creator_id=None,  # TODO: orcid lookup
                index=i,
                # Sorry humans :(
                raw=raw_name,
                role="author"))
        return contribs

    def _parse_refs(self, obj):
        """Return the list of ReleaseRef built from ``obj['reference']``."""
        refs = []
        for i, rm in enumerate(obj.get('reference') or []):
            refs.append(fatcat_client.ReleaseRef(
                index=i,
                target_release_id=None,  # TODO: DOI lookup: rm.get("DOI", None),
                # TODO: all these
                key=None,
                year=None,
                container_title=None,
                title=None,
                locator=None,
                # TODO: how to generate a proper stub here from k/v objdata?
                # TODO: just dump JSON here if we didn't get a match?
                # str() keeps the join from failing on non-string values.
                raw="| ".join(str(v) for v in rm.values())))
        return refs

    def parse_crossref_dict(self, obj):
        """Convert one parsed Crossref record into fatcat entities.

        obj is a python dict (parsed from json).
        returns a (WorkEntity, ReleaseEntity) tuple.

        Raises KeyError if 'type', 'title', 'DOI' or 'created' is missing,
        and IndexError if 'title' is an empty list.
        """

        # contribs
        contribs = self._parse_contribs(obj)

        # container
        # TODO: ISSN vs. ISSN-L
        # An 'ISSN' key holding an empty list is treated like a missing key.
        issn = (obj.get('ISSN') or [None])[0]
        container_id = self.lookup_issnl(issn)

        ## TODO: create containers in-line like this?
        #container = dict(
        #    issn=issn,
        #    name=obj['container-title'][0],
        #    container=container_id,
        #    #sortname=obj['short-container-title'][0])
        #    publisher=obj['publisher'])
        #if container_id is None and self.create_containers and issn != None:
        #    rv = self.post('/v0/container', data=dict(
        #        issn=container['issn'],
        #        publisher=container['publisher']))
        #    assert rv.status_code == 201
        #    container_id = rv.json()['id']
        #    print("created container: {}".format(issn))
        #    container['id'] = container_id
        #    self._issn_map[issn] = container_id

        # references
        refs = self._parse_refs(obj)

        # work
        work = fatcat_client.WorkEntity(
            work_type=obj['type'],
        )

        # release
        extra = dict(crossref={
            'links': obj.get('link', []),
            'subject': obj.get('subject'),
            'crossref-type': obj['type'],
            'alternative-id': obj.get('alternative-id', [])})

        release = fatcat_client.ReleaseEntity(
            # XXX: placeholder; create_row() overwrites it with the ident of
            # the newly created work before the release is submitted.
            work_id='null',
            title=obj['title'][0],
            contribs=contribs,
            refs=refs,
            container_id=container_id,
            release_type=obj['type'],
            doi=obj['DOI'],
            release_date=obj['created']['date-time'],
            #license=obj.get('license', [dict(URL=None)])[0]['URL'] or None,
            issue=obj.get('issue'),
            volume=obj.get('volume'),
            pages=obj.get('page'),
            extra=extra)
        return (work, release)

    def create_row(self, row, editgroup_id=None):
        """Parse one JSON row and create its work, then its release.

        A ``None`` row is skipped. The work is created first because the
        release needs the ``ident`` of the created work as its ``work_id``.
        """
        if row is None:
            # The original used `continue` here, which is a SyntaxError
            # outside a loop; returning early is the equivalent "skip".
            return
        obj = json.loads(row)
        both = self.parse_crossref_dict(obj)
        if both is None:
            # parse_crossref_dict currently always returns a tuple; this
            # guard is kept so an override may return None to skip a record.
            return
        work, release = both
        work.editgroup_id = editgroup_id
        release.editgroup_id = editgroup_id
        created = self.api.create_work(work)
        release.work_id = created.ident
        self.api.create_release(release)

    def create_batch(self, batch, editgroup_id=None):
        """Current work/release pairing disallows batch creation of releases.
        Could do batch work creation and then match against releases, but meh."""
        for row in batch:
            self.create_row(row, editgroup_id)