aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat/crossref_importer.py
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2018-06-20 09:37:37 -0700
committerBryan Newbold <bnewbold@robocracy.org>2018-06-20 09:37:37 -0700
commitbde5c8f14e13afe4d54e9bfafd8bda8b0f33f804 (patch)
tree67d7039b1621bebdafd89539602c2b5d05332501 /python/fatcat/crossref_importer.py
parent698399c49edcefe33c012856b604985925969a77 (diff)
downloadfatcat-bde5c8f14e13afe4d54e9bfafd8bda8b0f33f804.tar.gz
fatcat-bde5c8f14e13afe4d54e9bfafd8bda8b0f33f804.zip
python: refactor importer code (+crossref)
Diffstat (limited to 'python/fatcat/crossref_importer.py')
-rw-r--r--python/fatcat/crossref_importer.py112
1 files changed, 112 insertions, 0 deletions
diff --git a/python/fatcat/crossref_importer.py b/python/fatcat/crossref_importer.py
new file mode 100644
index 00000000..4c68230d
--- /dev/null
+++ b/python/fatcat/crossref_importer.py
@@ -0,0 +1,112 @@
+
+import sys
+import json
+import itertools
+import fatcat_client
+from fatcat.importer_common import FatcatImporter
+
+
+class FatcatCrossrefImporter(FatcatImporter):
+
+ # TODO: overload __init__ to handle create_containers
+
+ def parse_crossref_dict(self, obj):
+ """
+ obj is a python dict (parsed from json).
+ returns a ReleaseEntity
+ """
+
+ # contribs
+ contribs = []
+ for i, am in enumerate(obj['author']):
+ contribs.append(fatcat_client.ReleaseContrib(
+ creator_id=None, # TODO: orcid lookup
+ index=i,
+ # Sorry humans :(
+ raw="{} {}".format(am['given'], am['family']),
+ role="author"))
+
+ # container
+ # TODO: ISSN vs. ISSN-L
+ issn = obj.get('ISSN', [None])[0]
+ container_id = self.lookup_issnl(issn)
+
+ ## TODO: create containers in-line like this?
+ #container = dict(
+ # issn=issn,
+ # name=obj['container-title'][0],
+ # container=container_id,
+ # #sortname=obj['short-container-title'][0])
+ # publisher=obj['publisher'])
+ #if container_id is None and self.create_containers and issn != None:
+ # rv = self.post('/v0/container', data=dict(
+ # issn=container['issn'],
+ # publisher=container['publisher']))
+ # assert rv.status_code == 201
+ # container_id = rv.json()['id']
+ # print("created container: {}".format(issn))
+ # container['id'] = container_id
+ # self._issn_map[issn] = container_id
+
+ # references
+ refs = []
+ for i, rm in enumerate(obj.get('reference', [])):
+ refs.append(fatcat_client.ReleaseRef(
+ index=i,
+ target_release_id=None, # TODO: DOI lookup: rm.get("DOI", None),
+ # TODO: all these
+ key=None,
+ year=None,
+ container_title=None,
+ title=None,
+ locator=None,
+ # TODO: how to generate a proper stub here from k/v objdata?
+ # TODO: just dump JSON here if we didn't get a match?
+ raw="| ".join(rm.values())))
+
+ # work
+ we = fatcat_client.WorkEntity(
+ work_type=obj['type'],
+ )
+
+ # release
+ extra = dict(crossref={
+ 'links': obj.get('link', []),
+ 'subject': obj.get('subject'),
+ 'crossref-type': obj['type'],
+ 'alternative-id': obj.get('alternative-id', [])})
+
+ re = fatcat_client.ReleaseEntity(
+ work_id='null', # XXX:
+ title=obj['title'][0],
+ contribs=contribs,
+ refs=refs,
+ container_id=container_id,
+ release_type=obj['type'],
+ doi=obj['DOI'],
+ release_date=obj['created']['date-time'],
+ #license=obj.get('license', [dict(URL=None)])[0]['URL'] or None,
+ issue=obj.get('issue'),
+ volume=obj.get('volume'),
+ pages=obj.get('page'),
+ extra=extra)
+ return (we, re)
+
+ def create_row(self, row, editgroup_id=None):
+ if row is None:
+ continue
+ obj = json.loads(row)
+ both = self.parse_crossref_dict(obj)
+ if both is not None:
+ (we, re) = both
+ we.editgroup_id = editgroup_id
+ re.editgroup_id = editgroup_id
+ created = self.api.create_work(we)
+ re.work_id = created.ident
+ self.api.create_release(re)
+
+ def create_batch(self, batch, editgroup_id=None):
+ """Current work/release pairing disallows batch creation of releases.
+ Could do batch work creation and then match against releases, but meh."""
+ for row in batch:
+ self.create_row(row, editgroup_id)