more progress on crossref+orcid importers

author: Bryan Newbold <bnewbold@robocracy.org> 2018-06-20 17:54:54 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2018-06-20 17:54:54 -0700
commit: 43e74c2e81c64d6d4f4e644cc5a6f75945ff660d (patch)
tree: 769cae1607f3b8b9fff43fce99028bda571c2145 /python/fatcat/crossref_importer.py
parent: 381fe70c56b1a936d4eef676ee8ba546f6a3cf30 (diff)
download: fatcat-43e74c2e81c64d6d4f4e644cc5a6f75945ff660d.tar.gz fatcat-43e74c2e81c64d6d4f4e644cc5a6f75945ff660d.zip
1 files changed, 45 insertions, 40 deletions
diff --git a/python/fatcat/crossref_importer.py b/python/fatcat/crossref_importer.py
index 4c68230d..a7166bc3 100644
--- a/python/fatcat/crossref_importer.py
+++ b/python/fatcat/crossref_importer.py
@@ -8,7 +8,9 @@ from fatcat.importer_common import FatcatImporter
 
 class FatcatCrossrefImporter(FatcatImporter):
 
-    # TODO: overload __init__ to handle create_containers
+    def __init__(self, host_url, issn_map_file, create_containers=True):
+        super().__init__(host_url, issn_map_file)
+        self.create_containers = create_containers
 
     def parse_crossref_dict(self, obj):
         """
@@ -19,50 +21,49 @@ class FatcatCrossrefImporter(FatcatImporter):
         # contribs
         contribs = []
         for i, am in enumerate(obj['author']):
+            creator_id = None
+            if 'ORCID' in am.keys():
+                creator_id = self.lookup_orcid(am['ORCID'].split('/')[-1])
             contribs.append(fatcat_client.ReleaseContrib(
-                creator_id=None, # TODO: orcid lookup
-                index=i,
+                creator_id=creator_id,
+                index=i+1,
                 # Sorry humans :(
                 raw="{} {}".format(am['given'], am['family']),
                 role="author"))
 
         # container
-        # TODO: ISSN vs. ISSN-L
         issn = obj.get('ISSN', [None])[0]
-        container_id = self.lookup_issnl(issn)
+        issnl = self.issn2issnl(issn)
+        container_id = None
+        if issnl:
+            container_id = self.lookup_issnl(issnl)
+        publisher = obj['publisher']
 
-        ## TODO: create containers in-line like this?
-        #container = dict(
-        #    issn=issn,
-        #    name=obj['container-title'][0],
-        #    container=container_id,
-        #    #sortname=obj['short-container-title'][0])
-        #    publisher=obj['publisher'])
-        #if container_id is None and self.create_containers and issn != None:
-        #    rv = self.post('/v0/container', data=dict(
-        #        issn=container['issn'],
-        #        publisher=container['publisher']))
-        #    assert rv.status_code == 201
-        #    container_id = rv.json()['id']
-        #    print("created container: {}".format(issn))
-        #    container['id'] = container_id
-        #    self._issn_map[issn] = container_id
+        ce = None
+        if container_id is None and self.create_containers and issnl != None:
+            ce = fatcat_client.ContainerEntity(
+                issnl=issnl,
+                publisher=publisher,
+                name=obj['container-title'][0])
+            print("created container: {}".format(issnl))
 
         # references
         refs = []
         for i, rm in enumerate(obj.get('reference', [])):
+            try:
+                year = int(rm.get('year'))
+            except:
+                year = None
             refs.append(fatcat_client.ReleaseRef(
-                index=i,
+                index=i+1,
                 target_release_id=None, # TODO: DOI lookup: rm.get("DOI", None),
-                # TODO: all these
-                key=None,
-                year=None,
-                container_title=None,
-                title=None,
-                locator=None,
-                # TODO: how to generate a proper stub here from k/v objdata?
-                # TODO: just dump JSON here if we didn't get a match?
-                raw="| ".join(rm.values())))
+                # unreliable for crossref: key=rm['key'].split('|')[-1],
+                year=year,
+                container_title=rm.get('volume-title'),
+                title=rm.get('title'),
+                locator=rm.get('first-page'),
+                # TODO: just dump JSON somewhere here?
+                raw=rm.get('unstructured')))
 
         # work
         we = fatcat_client.WorkEntity(
@@ -73,34 +74,38 @@ class FatcatCrossrefImporter(FatcatImporter):
         extra = dict(crossref={
             'links': obj.get('link', []),
             'subject': obj.get('subject'),
-            'crossref-type': obj['type'],
+            'type': obj['type'],
+            'license': obj.get('license', [dict(URL=None)])[0]['URL'] or None,
             'alternative-id': obj.get('alternative-id', [])})
 
         re = fatcat_client.ReleaseEntity(
-            work_id='null', # XXX:
+            work_id='tbd', # gets set later, I promise!
             title=obj['title'][0],
             contribs=contribs,
             refs=refs,
             container_id=container_id,
             release_type=obj['type'],
-            doi=obj['DOI'],
+            doi=obj['DOI'].lower(),
             release_date=obj['created']['date-time'],
-            #license=obj.get('license', [dict(URL=None)])[0]['URL'] or None,
             issue=obj.get('issue'),
             volume=obj.get('volume'),
             pages=obj.get('page'),
             extra=extra)
-        return (we, re)
+        return (we, re, ce)
 
     def create_row(self, row, editgroup_id=None):
         if row is None:
-            continue
+            return
         obj = json.loads(row)
-        both = self.parse_crossref_dict(obj)
-        if both is not None:
-            (we, re) = both
+        entities = self.parse_crossref_dict(obj)
+        if entities is not None:
+            (we, re, ce) = entities
             we.editgroup_id = editgroup_id
             re.editgroup_id = editgroup_id
+            if ce is not None:
+                ce.editgroup_id = editgroup_id
+                container = self.api.create_container(ce)
+                re.container_id = container.ident
             created = self.api.create_work(we)
             re.work_id = created.ident
             self.api.create_release(re)
author	Bryan Newbold <bnewbold@robocracy.org>	2018-06-20 17:54:54 -0700
committer	Bryan Newbold <bnewbold@robocracy.org>	2018-06-20 17:54:54 -0700
commit	43e74c2e81c64d6d4f4e644cc5a6f75945ff660d (patch)
tree	769cae1607f3b8b9fff43fce99028bda571c2145 /python/fatcat/crossref_importer.py
parent	381fe70c56b1a936d4eef676ee8ba546f6a3cf30 (diff)
download	fatcat-43e74c2e81c64d6d4f4e644cc5a6f75945ff660d.tar.gz fatcat-43e74c2e81c64d6d4f4e644cc5a6f75945ff660d.zip