author    Bryan Newbold <bnewbold@robocracy.org>  2018-06-20 17:54:54 -0700
committer Bryan Newbold <bnewbold@robocracy.org>  2018-06-20 17:54:54 -0700
commit    43e74c2e81c64d6d4f4e644cc5a6f75945ff660d (patch)
tree      769cae1607f3b8b9fff43fce99028bda571c2145 /python/fatcat
parent    381fe70c56b1a936d4eef676ee8ba546f6a3cf30 (diff)
more progress on crossref+orcid importers
Diffstat (limited to 'python/fatcat')
-rw-r--r--  python/fatcat/crossref_importer.py | 85
-rw-r--r--  python/fatcat/importer_common.py   | 44
2 files changed, 84 insertions(+), 45 deletions(-)
diff --git a/python/fatcat/crossref_importer.py b/python/fatcat/crossref_importer.py
index 4c68230d..a7166bc3 100644
--- a/python/fatcat/crossref_importer.py
+++ b/python/fatcat/crossref_importer.py
@@ -8,7 +8,9 @@ from fatcat.importer_common import FatcatImporter
class FatcatCrossrefImporter(FatcatImporter):

-    # TODO: overload __init__ to handle create_containers
+    def __init__(self, host_url, issn_map_file, create_containers=True):
+        super().__init__(host_url, issn_map_file)
+        self.create_containers = create_containers

    def parse_crossref_dict(self, obj):
        """
@@ -19,50 +21,49 @@ class FatcatCrossrefImporter(FatcatImporter):
        # contribs
        contribs = []
        for i, am in enumerate(obj['author']):
+            creator_id = None
+            if 'ORCID' in am.keys():
+                creator_id = self.lookup_orcid(am['ORCID'].split('/')[-1])
            contribs.append(fatcat_client.ReleaseContrib(
-                creator_id=None, # TODO: orcid lookup
-                index=i,
+                creator_id=creator_id,
+                index=i+1,
                # Sorry humans :(
                raw="{} {}".format(am['given'], am['family']),
                role="author"))
        # container
-        # TODO: ISSN vs. ISSN-L
        issn = obj.get('ISSN', [None])[0]
-        container_id = self.lookup_issnl(issn)
+        issnl = self.issn2issnl(issn)
+        container_id = None
+        if issnl:
+            container_id = self.lookup_issnl(issnl)
+        publisher = obj['publisher']
-        ## TODO: create containers in-line like this?
-        #container = dict(
-        #    issn=issn,
-        #    name=obj['container-title'][0],
-        #    container=container_id,
-        #    #sortname=obj['short-container-title'][0])
-        #    publisher=obj['publisher'])
-        #if container_id is None and self.create_containers and issn != None:
-        #    rv = self.post('/v0/container', data=dict(
-        #        issn=container['issn'],
-        #        publisher=container['publisher']))
-        #    assert rv.status_code == 201
-        #    container_id = rv.json()['id']
-        #    print("created container: {}".format(issn))
-        #    container['id'] = container_id
-        #    self._issn_map[issn] = container_id
+        ce = None
+        if container_id is None and self.create_containers and issnl is not None:
+            ce = fatcat_client.ContainerEntity(
+                issnl=issnl,
+                publisher=publisher,
+                name=obj['container-title'][0])
+            print("created container: {}".format(issnl))
        # references
        refs = []
        for i, rm in enumerate(obj.get('reference', [])):
+            try:
+                year = int(rm.get('year'))
+            except (ValueError, TypeError):
+                year = None
            refs.append(fatcat_client.ReleaseRef(
-                index=i,
+                index=i+1,
                target_release_id=None, # TODO: DOI lookup: rm.get("DOI", None),
-                # TODO: all these
-                key=None,
-                year=None,
-                container_title=None,
-                title=None,
-                locator=None,
-                # TODO: how to generate a proper stub here from k/v objdata?
-                # TODO: just dump JSON here if we didn't get a match?
-                raw="| ".join(rm.values())))
+                # unreliable for crossref: key=rm['key'].split('|')[-1],
+                year=year,
+                container_title=rm.get('volume-title'),
+                title=rm.get('title'),
+                locator=rm.get('first-page'),
+                # TODO: just dump JSON somewhere here?
+                raw=rm.get('unstructured')))
        # work
        we = fatcat_client.WorkEntity(
@@ -73,34 +74,38 @@ class FatcatCrossrefImporter(FatcatImporter):
        extra = dict(crossref={
            'links': obj.get('link', []),
            'subject': obj.get('subject'),
-            'crossref-type': obj['type'],
+            'type': obj['type'],
+            'license': obj.get('license', [dict(URL=None)])[0]['URL'] or None,
            'alternative-id': obj.get('alternative-id', [])})
        re = fatcat_client.ReleaseEntity(
-            work_id='null', # XXX:
+            work_id='tbd', # gets set later, I promise!
            title=obj['title'][0],
            contribs=contribs,
            refs=refs,
            container_id=container_id,
            release_type=obj['type'],
-            doi=obj['DOI'],
+            doi=obj['DOI'].lower(),
            release_date=obj['created']['date-time'],
-            #license=obj.get('license', [dict(URL=None)])[0]['URL'] or None,
            issue=obj.get('issue'),
            volume=obj.get('volume'),
            pages=obj.get('page'),
            extra=extra)
-        return (we, re)
+        return (we, re, ce)

    def create_row(self, row, editgroup_id=None):
        if row is None:
-            continue
+            return
        obj = json.loads(row)
-        both = self.parse_crossref_dict(obj)
-        if both is not None:
-            (we, re) = both
+        entities = self.parse_crossref_dict(obj)
+        if entities is not None:
+            (we, re, ce) = entities
            we.editgroup_id = editgroup_id
            re.editgroup_id = editgroup_id
+            if ce is not None:
+                ce.editgroup_id = editgroup_id
+                container = self.api.create_container(ce)
+                re.container_id = container.ident
            created = self.api.create_work(we)
            re.work_id = created.ident
            self.api.create_release(re)
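
For context, a minimal sketch of how this importer might be driven end-to-end. The host URL and file paths here are hypothetical, and process_source comes from the FatcatImporter base class diffed below:

from fatcat.crossref_importer import FatcatCrossrefImporter

# hypothetical paths/host; the ISSN map file is read once at construction
with open('ISSN-to-ISSN-L.txt') as issn_map_file:
    importer = FatcatCrossrefImporter(
        host_url="http://localhost:9411/v0",
        issn_map_file=issn_map_file,
        create_containers=True)

# one JSON-encoded crossref work per line; the base class creates and
# auto-accepts an editgroup every group_size rows
with open('crossref-works.json') as source:
    importer.process_source(source, group_size=100)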
diff --git a/python/fatcat/importer_common.py b/python/fatcat/importer_common.py
index 98bfb26e..c24565b4 100644
--- a/python/fatcat/importer_common.py
+++ b/python/fatcat/importer_common.py
@@ -13,11 +13,15 @@ def grouper(iterable, n, fillvalue=None):
class FatcatImporter:

-    def __init__(self, host_url):
+    def __init__(self, host_url, issn_map_file=None):
        conf = fatcat_client.Configuration()
        conf.host = host_url
        self.api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf))
-        self._issnl_map = dict()
+        self._issnl_id_map = dict()
+        self._orcid_id_map = dict()
+        self._issn_issnl_map = None
+        if issn_map_file:
+            self.read_issn_map_file(issn_map_file)

    def process_source(self, source, group_size=100):
        """Creates and auto-accepts editgroup every group_size rows"""
@@ -40,8 +44,8 @@ class FatcatImporter:
    def lookup_issnl(self, issnl):
        """Caches calls to the ISSN-L lookup API endpoint in a local dict"""
        assert len(issnl) == 9 and issnl[4] == '-'
-        if issnl in self._issnl_map:
-            return self._issnl_map[issn]
+        if issnl in self._issnl_id_map:
+            return self._issnl_id_map[issnl]
        container_id = None
        try:
            rv = self.api.lookup_container(issnl=issnl)
@@ -49,5 +53,35 @@ class FatcatImporter:
        except ApiException as ae:
            # If anything other than a 404 (not found), something is wrong
            assert ae.status == 404
-        self._issnl_map[issnl] = container_id # might be None
+        self._issnl_id_map[issnl] = container_id # might be None
        return container_id
+
+    def lookup_orcid(self, orcid):
+        """Caches calls to the ORCID lookup API endpoint in a local dict"""
+        assert len(orcid) == 19 and orcid[4] == '-'
+        if orcid in self._orcid_id_map:
+            return self._orcid_id_map[orcid]
+        creator_id = None
+        try:
+            rv = self.api.lookup_creator(orcid=orcid)
+            creator_id = rv.ident
+        except ApiException as ae:
+            # If anything other than a 404 (not found), something is wrong
+            assert ae.status == 404
+        self._orcid_id_map[orcid] = creator_id # might be None
+        return creator_id
+
+    def read_issn_map_file(self, issn_map_file):
+        self._issn_issnl_map = dict()
+        for line in issn_map_file:
+            if line.startswith("ISSN") or len(line.strip()) == 0:
+                continue
+            (issn, issnl) = line.split()[0:2]
+            self._issn_issnl_map[issn] = issnl
+            # double mapping makes lookups easy
+            self._issn_issnl_map[issnl] = issnl
+
+    def issn2issnl(self, issn):
+        if issn is None:
+            return None
+        return self._issn_issnl_map.get(issn)
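
The map file consumed by read_issn_map_file is a whitespace-delimited two-column table (an ISSN, then its linking ISSN-L) with a header row starting with "ISSN", like the table issn.org distributes. A minimal sketch with made-up identifiers, exercising the double mapping:

import io

from fatcat.importer_common import FatcatImporter

# hypothetical in-memory map file; real files come from issn.org
fake_map = io.StringIO(
    "ISSN ISSN-L\n"
    "1234-5678 1234-5678\n"
    "8765-4321 1234-5678\n")

# hypothetical host; the constructor only builds an API client
importer = FatcatImporter("http://localhost:9411/v0", issn_map_file=fake_map)
assert importer.issn2issnl("8765-4321") == "1234-5678"  # ISSN -> ISSN-L
assert importer.issn2issnl("1234-5678") == "1234-5678"  # ISSN-L maps to itself
assert importer.issn2issnl(None) is None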