crossref importer updates

author: Bryan Newbold <bnewbold@robocracy.org> 2019-01-22 16:27:27 -0800
committer: Bryan Newbold <bnewbold@robocracy.org> 2019-01-22 16:27:27 -0800
commit: 9ab88508ed710de9db06a27436042ac30a70676e (patch)
tree: 47c0c8b7a15eaf88eb7472944d21f4b328b4de94 /python/fatcat_tools/importers
parent: c6444a6ebee4a541735705e10885067e6d012df1 (diff)
download: fatcat-9ab88508ed710de9db06a27436042ac30a70676e.tar.gz
fatcat-9ab88508ed710de9db06a27436042ac30a70676e.zip
1 files changed, 78 insertions, 19 deletions
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index 4b9199cd..8953dd82 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -32,6 +32,31 @@ CROSSREF_TYPE_MAP = {
     'standard': 'standard',
 }
 
+CONTAINER_TYPE_MAP = {
+    'article-journal': 'journal',
+    'paper-conference': 'conference',
+    'book': 'book-series',
+}
+
+# TODO:
+LICENSE_SLUG_MAP = {
+    "http://creativecommons.org/licenses/by/3.0/": "CC-BY",
+    "http://creativecommons.org/licenses/by/4.0/": "CC-BY",
+    "http://creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA",
+    "http://creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
+    "http://creativecommons.org/licenses/by-nd/3.0/": "CC-BY-ND",
+    "http://creativecommons.org/licenses/by-nd/4.0/": "CC-BY-ND",
+    "http://creativecommons.org/licenses/by-nc/3.0/": "CC-BY-NC",
+    "http://creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
+    "http://creativecommons.org/licenses/by-nc-sa/3.0/": "CC-BY-NC-SA",
+    "http://creativecommons.org/licenses/by-nc-sa/4.0/": "CC-BY-NC-SA",
+    "http://creativecommons.org/licenses/by-nc-nd/3.0/": "CC-BY-NC-ND",
+    "http://creativecommons.org/licenses/by-nc-nd/4.0/": "CC-BY-NC-ND",
+    "http://www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0",
+    # http://onlinelibrary.wiley.com/termsAndConditions doesn't seem like a license
+    # http://www.springer.com/tdm doesn't seem like a license
+}
+
 class CrossrefImporter(FatcatImporter):
     """
     Importer for Crossref metadata.
@@ -66,17 +91,21 @@ class CrossrefImporter(FatcatImporter):
 
     def lookup_ext_ids(self, doi):
         if self.extid_map_db is None:
-            return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None)
+            return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None)
         row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1",
             [doi.lower()]).fetchone()
         if row is None:
-            return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None)
+            return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None)
         row = [str(cell or '') or None for cell in row]
         return dict(
             core_id=row[0],
             pmid=row[1],
             pmcid=row[2],
-            wikidata_qid=row[3])
+            wikidata_qid=row[3],
+            # TODO:
+            arxiv_id=None,
+            jstor_id=None,
+        )
 
     def map_release_type(self, crossref_type):
         return CROSSREF_TYPE_MAP.get(crossref_type)
@@ -98,6 +127,8 @@ class CrossrefImporter(FatcatImporter):
                 'book-track', 'proceedings-series'):
             return None
 
+        release_type = self.map_release_type(obj['type'])
+
         # lookup existing DOI
         existing_release = None
         if self.check_existing:
@@ -132,9 +163,13 @@ class CrossrefImporter(FatcatImporter):
                     index = i
                 else:
                     index = None
+                raw_affiliation = None
                 if am.get('affiliation'):
-                    # note: affiliation => affiliations
-                    extra['affiliations'] = am.get('affiliation')
+                    if len(am.get('affiliation')) > 0:
+                        raw_affiliation = am.get('affiliation')[0]['name']
+                    if len(am.get('affiliation')) > 1:
+                        # note: affiliation => affiliations
+                        extra['affiliations'] = [a['name'] for a in am.get('affiliation')[1:]]
                 if am.get('sequence') and am.get('sequence') != "additional":
                     extra['sequence'] = am.get('sequence')
                 if not extra:
@@ -144,6 +179,7 @@ class CrossrefImporter(FatcatImporter):
                     creator_id=creator_id,
                     index=index,
                     raw_name=raw_name,
+                    raw_affiliation=raw_affiliation,
                     role=ctype,
                     extra=extra))
             return contribs
@@ -165,8 +201,19 @@ class CrossrefImporter(FatcatImporter):
             ce = fatcat_client.ContainerEntity(
                 issnl=issnl,
                 publisher=publisher,
+                container_type=CONTAINER_TYPE_MAP.get(release_type),
                 name=obj['container-title'][0])
 
+        # license slug
+        license_slug = None
+        for l in obj.get('license', []):
+            if l['content-version'] not in ('vor', 'unspecified'):
+                continue
+            slug = LICENSE_SLUG_MAP.get(l['URL'])
+            if slug:
+                license_slug = slug
+                break
+
         # references
         refs = []
         for i, rm in enumerate(obj.get('reference', [])):
@@ -188,10 +235,14 @@ class CrossrefImporter(FatcatImporter):
             container_name = rm.get('volume-title')
             if not container_name:
                 container_name = rm.get('journal-title')
+            ref_locator = rm.get('first-page')
+            ref_title = rm.get('title')
+            if extra.get('DOI'):
+                extra['doi'] = extra['DOI']
             extra.pop('DOI', None)
             extra.pop('key', None)
             extra.pop('year', None)
-            extra.pop('volume-name', None)
+            extra.pop('volume-title', None)
             extra.pop('journal-title', None)
             extra.pop('title', None)
             extra.pop('first-page', None)
@@ -207,8 +258,8 @@ class CrossrefImporter(FatcatImporter):
                 key=key,
                 year=year,
                 container_name=container_name,
-                title=rm.get('title'),
-                locator=rm.get('first-page'),
+                title=ref_title,
+                locator=ref_locator,
                 # TODO: just dump JSON somewhere here?
                 extra=extra))
 
@@ -277,26 +328,32 @@ class CrossrefImporter(FatcatImporter):
 
         re = fatcat_client.ReleaseEntity(
             work_id=None,
-            title=obj.get('title', [None])[0],
-            contribs=contribs,
-            refs=refs,
             container_id=container_id,
-            publisher=publisher,
-            release_type=self.map_release_type(obj['type']),
+            title=obj.get('title', [None])[0],
+            original_title=obj.get('original-title', [None])[0],
+            release_type=release_type,
             release_status=release_status,
+            release_date=release_date,
+            release_year=release_year,
+            publisher=publisher,
             doi=obj['DOI'].lower(),
-            isbn13=isbn13,
-            core_id=extids['core_id'],
             pmid=extids['pmid'],
             pmcid=extids['pmcid'],
             wikidata_qid=extids['wikidata_qid'],
-            release_date=release_date,
-            release_year=release_year,
-            issue=obj.get('issue'),
+            isbn13=isbn13,
+            core_id=extids['core_id'],
+            arxiv_id=extids['arxiv_id'],
+            jstor_id=extids['jstor_id'],
             volume=obj.get('volume'),
+            issue=obj.get('issue'),
             pages=obj.get('page'),
+            language=None,  # crossref doesn't supply language info
+            license_slug=license_slug,
+            extra=dict(crossref=extra),
             abstracts=abstracts,
-            extra=dict(crossref=extra))
+            contribs=contribs,
+            refs=refs,
+        )
         return (re, ce)
 
     def create_row(self, row, editgroup_id=None):
@@ -304,6 +361,8 @@ class CrossrefImporter(FatcatImporter):
             return
         obj = json.loads(row)
         entities = self.parse_crossref_dict(obj)
+        # XXX:
+        print(entities)
         if entities is not None:
             (re, ce) = entities
             if ce is not None:
author	Bryan Newbold <bnewbold@robocracy.org>	2019-01-22 16:27:27 -0800
committer	Bryan Newbold <bnewbold@robocracy.org>	2019-01-22 16:27:27 -0800
commit	9ab88508ed710de9db06a27436042ac30a70676e (patch)
tree	47c0c8b7a15eaf88eb7472944d21f4b328b4de94 /python/fatcat_tools/importers
parent	c6444a6ebee4a541735705e10885067e6d012df1 (diff)
download	fatcat-9ab88508ed710de9db06a27436042ac30a70676e.tar.gz fatcat-9ab88508ed710de9db06a27436042ac30a70676e.zip