From 76d6d4d2de6580ae147e40c43c18f04cc48b62ec Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Tue, 17 Dec 2019 17:38:45 +0100
Subject: datacite: add missing mappings and notes

---
 python/fatcat_tools/importers/datacite.py | 441 ++++++++++++------------------
 1 file changed, 175 insertions(+), 266 deletions(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index e486ba90..4e117dde 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -1,5 +1,5 @@
 """
-WIP: Importer for datacite.org data.
+Prototype importer for datacite.org data.
 
 Example doc at: https://gist.github.com/miku/5610a2d64e3fee82d16f5d3f3a295fc8
 """
@@ -8,9 +8,11 @@ from .common import EntityImporter
 import dateparser
 import langcodes
 import datetime
+import langdetect
 import fatcat_openapi_client
 import json
 import sys
+import hashlib
 
 # https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary
 CONTAINER_TYPE_MAP = {
@@ -20,40 +22,8 @@ CONTAINER_TYPE_MAP = {
 }
 
 # The docs/guide should be the canonical home for these mappings; update there
-# first.
-#
-# > select count(*), release_type from release_rev group by release_type order by count(*) desc;
-#
-#   count   |   release_type
-# ----------+-------------------
-#  95030004 | article-journal
-#  13477878 | chapter
-#   5926811 | paper-conference
-#   2169642 | article
-#   1806415 | dataset
-#   1548614 | book
-#   1390304 |
-#    818351 | report
-#    815684 | entry
-#    307998 | standard
-#    297769 | thesis
-#    261426 | letter
-#    148093 | post
-#    122736 | editorial
-#     99225 | stub
-#     96219 | review-book
-#     22854 | peer_review
-#     19078 | interview
-#     16278 | article-newspaper
-#      3973 | speech
-#      3536 | legal_case
-#      2264 | abstract
-#      1626 | legislation
-#      1053 | retraction
-#        85 | component
-# (25 rows)
-#
-# Map various datacite type types to CSL-ish types. None means TODO or remove.
+# first. Map the various datacite type fields to CSL-ish types. None means
+# TODO or remove.
 DATACITE_TYPE_MAP = {
     'ris': {
         'THES': 'thesis',
@@ -197,91 +167,17 @@ class DataciteImporter(EntityImporter):
 
     def parse_record(self, obj):
         """
-        TODO(martin): Map datacite to RE.
-
-        WIP, notes:
-
-        * Many subjects, should they end up in extra?
-        * attributes.creators and attributes.contributors
-
-        $ jq '.attributes.creators[]?.nameType?' datacite.500k | sort | uniq -c | sort -nr
-        3963663 "Personal"
-        289795 null
-        8892 "Organizational"
-
-        Shall we use issued, available?
-
-          {
-            "date": "2011-11-18",
-            "dateType": "Accepted"
-          },
-          {
-            "date": "2011-11-18",
-            "dateType": "Available"
-          },
-          {
-            "date": "2011-11-07",
-            "dateType": "Copyrighted"
-          },
-          {
-            "date": "2011-11-18",
-            "dateType": "Issued"
-          },
-          {
-            "date": "2011-11-07",
-            "dateType": "Issued"
-          }
-
-        TODO(martin): Quick analysis of dates and stages.
+        Mapping datacite JSON to ReleaseEntity.
         """
-
         if 'attributes' not in obj:
             return None
 
         attributes = obj['attributes']
 
-        # > Contributors
-        #
-        #  "attributes.creators[].contributorType": [
-        #    "author"
-        #  ],
-        #  "attributes.creators[].nameIdentifiers[].nameIdentifierScheme": [
-        #    "LCNA",
-        #    "GND",
-        #    "email",
-        #    "NAF",
-        #    "OSF",
-        #    "RRID",
-        #    "ORCID",
-        #    "SCOPUS",
-        #    "NRCPID",
-        #    "schema.org",
-        #    "GRID",
-        #    "MGDS",
-        #    "VIAF",
-        #    "JACoW-ID"
-        #  ],
-        #
-        #    "https://orcid.org/0000-0002-9902-738X",
-        #    "http://jacow.org/JACoW-00001280",
-        #    "Wiebe_Peter",
-        #    "https://osf.io/https://osf.io/kjfuy/",
-        #    "http://www.viaf.org176549220",
-        #    "2239",
-        #    "Jeffries_Martin",
-        #    "https://orcid.org/0000-0002-1493-6630",
-        #    "0000-0002-6233-612X",
-        #
-        # "creators": [
-        #   {
-        #     "name": "Bögli, Hans",
-        #     "nameType": "Personal",
-        #     "givenName": "Hans",
-        #     "familyName": "Bögli",
-        #     "affiliation": []
-        #   }
-        # ],
-
+        # Contributors. Datacite lists many nameIdentifierSchemes, which we
+        # do not use yet:
+        # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme": [
+        # "LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID", "SCOPUS",
+        # "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID" ],
         contribs = []
 
         for i, c in enumerate(attributes['creators']):
@@ -304,15 +200,8 @@ class DataciteImporter(EntityImporter):
                 surname=c.get('familyName'),
             ))
 
-        # > Title
-        #
-        #   "attributes.titles[].titleType": [
-        #     "AlternativeTitle",
-        #     "Other",
-        #     "Subtitle",
-        #     null,
-        #     "TranslatedTitle"
-        #   ],
+        # Title. Entries may come with an "attributes.titles[].titleType",
+        # like "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle".
         title, subtitle = None, None
 
         for entry in attributes.get('titles', []):
@@ -321,22 +210,13 @@ class DataciteImporter(EntityImporter):
             if entry.get('titleType') == 'Subtitle':
                 subtitle = entry.get('title').strip()
 
-        # > Dates
-        #
-        #  "attributes.dates[].dateType": [
-        #    "Accepted",
-        #    "Available"
-        #    "Collected",
-        #    "Copyrighted",
-        #    "Created",
-        #    "Issued",
-        #    "Submitted",
-        #    "Updated",
-        #    "Valid",
-        #  ],
-        #
-        # Different documents have different dates defined. Choose the topmost
-        # available from prio list.
+        # Dates. Datacite has a few internal dates (registered, created,
+        # updated) and a published year (observed values range from 0 to
+        # 2554). We prefer the typed date list in
+        # "attributes.dates[].dateType", with values: "Accepted", "Available",
+        # "Collected", "Copyrighted", "Created", "Issued", "Submitted",
+        # "Updated", "Valid".
+        release_year, release_date = None, None
+
         date_type_prio = (
             'Valid',
             'Issued',
@@ -348,14 +228,16 @@ class DataciteImporter(EntityImporter):
             'Created',
             'Updated',
         )
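+        # The first dateType in this priority tuple that yields a parseable
+        # date determines release_date and release_year; e.g. an "Issued"
+        # date is preferred over an "Updated" one.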
-
-        release_year, release_date = None, None
         for prio in date_type_prio:
             dates = attributes.get('dates', []) or [] # Never None.
             for item in dates:
                 if not item.get('dateType') == prio:
                     continue
-                result = dateparser.parse(item.get('date'))
+                try:
+                    result = dateparser.parse(item.get('date'))
+                except TypeError as err:
+                    print("date parsing failed for {}: {}".format(item.get('date'), err), file=sys.stderr)
+                    continue
                 if result is None:
                     # Unparsable date.
                     continue
@@ -369,56 +251,23 @@ class DataciteImporter(EntityImporter):
                 continue
             break
 
-        # > Publisher
-        #
-        # A few NA values. A few bogus values.
-        #
+        # Publisher. A few NA values. A few bogus values.
         publisher = attributes.get('publisher')
 
-        if publisher in ('(:unav)', 'Unknown', 'n.a.', '[s.n.]', '(:unap)'):
+        if publisher in ('(:unav)', 'Unknown', 'n.a.', '[s.n.]', '(:unap)', '(:none)'):
             publisher = None
         if publisher is not None and len(publisher) > 80:
-            # Arbitrary magic value, TODO(martin): better heuristic.
-            # Example: "ETH-Bibliothek Zürich, Bildarchiv / Fotograf: Feller,
-            # Elisabeth, Empfänger, Unbekannt, Fotograf / Fel_041033-RE / Unbekannt,
-            # Nutzungsrechte müssen durch den Nutzer abgeklärt werden",
-            # TODO(martin): log misses.
+            # Arbitrary magic max length. TODO(martin): find a better,
+            # factored-out heuristic; first we have to log misses. Example:
+            # "ETH-Bibliothek Zürich, Bildarchiv / Fotograf: Feller,
+            # Elisabeth, Empfänger, Unbekannt, Fotograf / Fel_041033-RE /
+            # Unbekannt, Nutzungsrechte müssen durch den Nutzer abgeklärt
+            # werden"
             publisher = None
 
-        # > Container
-        #
-        # For the moment, only ISSN as container.
-        #
-        #    "container": {
-        #      "type": "Journal",
-        #      "issue": "8",
-        #      "title": "Angewandte Chemie International Edition",
-        #      "volume": "57",
-        #      "lastPage": "2080",
-        #      "firstPage": "2077",
-        #      "identifier": "14337851",
-        #      "identifierType": "ISSN"
-        #    },
-        #
-        #  "attributes.container.type": [
-        #    "DataRepository",
-        #    "Journal",
-        #    "Series",
-        #    "Book Series"
-        #  ],
-        #
-        #  "attributes.container.identifierType": [
-        #    "Handle",
-        #    "ISBN",
-        #    "LISSN",
-        #    "DOI",
-        #    "EISSN",
-        #    "URL",
-        #    "ISSN"
-        #  ],
-        #
-
+        # Container. For the moment, only ISSN-identified containers are handled.
         container_id = None
+
         container = attributes.get('container', {}) or {}
         if container.get('type') in CONTAINER_TYPE_MAP.keys():
             container_type = CONTAINER_TYPE_MAP.get(container['type'])
@@ -440,142 +289,202 @@ class DataciteImporter(EntityImporter):
                         container_id = ce_edit.ident
                         self._issnl_id_map[issnl] = container_id
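+                        # Cache the new container id under its ISSN-L, so a
+                        # journal is only created once per import run.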
 
-        # > License
-        #
-        # attributes.rightsList[].rightsUri
-        # attributes.rightsList[].rights
-        # attributes.rightsList[].lang
-        #
+        # Volume and issue.
+        volume = container.get('volume')
+        issue = container.get('issue')
+
+        # Pages.
+        pages = None
+
+        first_page = container.get('firstPage')
+        last_page = container.get('lastPage')
+
+        if first_page and last_page:
+            try:
+                # The comparison result is unused; this only verifies that
+                # both values parse as integers before joining them.
+                int(first_page) < int(last_page)
+                pages = '{}-{}'.format(first_page, last_page)
+            except ValueError as err:
+                print('unusable page range: {}'.format(err), file=sys.stderr)
+
+        if not pages and first_page:
+            pages = first_page
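+        # Example: firstPage "2077" and lastPage "2080" yield pages
+        # "2077-2080"; a lone firstPage is used as-is.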
 
+        # License.
         license_slug = None
         license_extra = []
+
         for l in attributes.get('rightsList', []):
             slug = lookup_license_slug(l.get('rightsUri'))
             if slug:
                 license_slug = slug
             license_extra.append(l)
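+        # If multiple rights entries resolve to a slug, the last one wins;
+        # all entries are kept verbatim for extra['datacite']['license'].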
 
-        # > Release type.
-        #
-        # Datacite has some fine granular typing (e.g. "Supplementary
-        # Collection of Datasets", "Taxonomic treatment", "blog_entry", ...
-        #
-        # Additional, coarse: resourceTypeGeneral
-        #
-        #  "attributes.types.resourceTypeGeneral": [
-        #    "Image",
-        #    "Dataset",
-        #    "PhysicalObject",
-        #    "Collection",
-        #    "Text",
-        #    "Sound",
-        #    "InteractiveResource",
-        #    "Event",
-        #    "Software",
-        #    "Other",
-        #    "Workflow",
-        #    "Audiovisual"
-        #  ],
-        #  "attributes.types.citeproc": [
-        #    "dataset",
-        #    "chapter",
-        #    "article-journal",
-        #    "song",
-        #    "article",
-        #    "report",
-        #    "graphic",
-        #    "thesis",
-        #    "book"
-        #  ],
-        #
-        # There is RIS, also.
-
-        # attributes.types.resourceType contains too many things for now.
+        # Release type. Try to determine the release type from the variety
+        # of type fields datacite supplies. "attributes.types.resourceType"
+        # contains too many distinct values (176 in a sample) to map for now;
+        # citeproc may be the closest fit, but it is not always supplied.
         for typeType in ('citeproc', 'resourceTypeGeneral', 'schemaOrg', 'bibtex', 'ris'):
-            release_type = attributes.get('types', {}).get(typeType)
+            value = attributes.get('types', {}).get(typeType)
+            release_type = DATACITE_TYPE_MAP.get(typeType, {}).get(value)
             if release_type is not None:
                 break
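+        # Example: a record with {"types": {"ris": "THES"}} and no more
+        # specific type field resolves to "thesis" via
+        # DATACITE_TYPE_MAP['ris']['THES'].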
 
-        # TODO(martin): Skip unmapped release_type entirely?
         if release_type is None:
             print("datacite unmapped type: {}".format(attributes.get('types', {})), file=sys.stderr)
 
-        # > Language.
-        # attributes.language
-
+        # Language values are varied ("ger", "es", "English", "ENG", "en-us",
+        # "other", ...). Try to crush them with langcodes: "It may sound to
+        # you like langcodes solves a pretty boring problem. At one level,
+        # that's right. Sometimes you have a boring problem, and it's great
+        # when a library solves it for you." (from the langcodes docs) --
+        # TODO(martin): We need more of these.
         language = None
-        value = attributes.get('language', '') or '' # As it is written.
+
+        value = attributes.get('language', '') or ''
         try:
             language = langcodes.find(value).language
         except LookupError:
             try:
                 language = langcodes.get(value).language
             except langcodes.tag_parser.LanguageTagError:
+                print('could not determine language: {}'.format(value), file=sys.stderr)
+
+        # Abstracts appear in "attributes.descriptions[].descriptionType", some
+        # of the observed values: "Methods", "TechnicalInfo",
+        # "SeriesInformation", "Other", "TableOfContents", "Abstract". The
+        # "Other" fields might contain references or related articles (with
+        # DOI). TODO(martin): maybe try to parse out some of those refs.
+        abstracts = []
+
+        for desc in attributes.get('descriptions', []):
+            if not desc.get('descriptionType') == 'Abstract':
+                continue
+            if len(desc.get('description', '')) < 10:
+                continue
+            text = desc.get('description')
+            sha1 = hashlib.sha1(text.encode('utf-8')).hexdigest()
+            lang = None
+            try:
+                lang = langdetect.detect(text)
+            except langdetect.lang_detect_exception.LangDetectException:
                 pass
+            abstracts.append(fatcat_openapi_client.ReleaseAbstract(
+                mimetype="text/plain",
+                content=text,
+                sha1=sha1,
+                lang=lang,
+            ))
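+        # The sha1 is computed over the UTF-8 encoded abstract text; language
+        # detection is best-effort and may remain None.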
+
+        # References and relations. Datacite includes many relation types in
+        # "attributes.relatedIdentifiers[].relationType", e.g.
+        # "IsPartOf", "IsPreviousVersionOf", "Continues", "IsVariantFormOf",
+        # "IsSupplementTo", "Cites", "IsSupplementedBy", "IsDocumentedBy", "HasVersion",
+        # "IsCitedBy", "IsMetadataFor", "IsNewVersionOf", "IsIdenticalTo", "HasPart",
+        # "References", "Reviews", "HasMetadata", "IsContinuedBy", "IsVersionOf",
+        # "IsDerivedFrom", "IsSourceOf".
+        #
+        # For the moment, we only care about References.
+        refs, ref_index = [], 0
+
+        for rel in attributes.get('relatedIdentifiers', []):
+            if not rel.get('relationType') == 'References':
+                continue
+            ref_extra = dict()
+            if rel.get('relatedIdentifierType') == 'DOI':
+                ref_extra['doi'] = rel.get('relatedIdentifier')
+            if not ref_extra:
+                ref_extra = None
+            refs.append(fatcat_openapi_client.ReleaseRef(
+                index=ref_index,
+                extra=ref_extra,
+            ))
+            ref_index += 1
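+        # Each reference carries only its position and, if available, the
+        # related DOI in extra; no release ident lookup is attempted here.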
+
+        # Release stage. Start with the clear case: published. TODO(martin):
+        # we could probably infer a bit more from the relations, e.g.
+        # "IsPreviousVersionOf" or "IsNewVersionOf".
+        release_stage = None
+        if attributes.get('state') == 'findable' or attributes.get('isActive') is True:
+            release_stage = 'published'
+
+        # Extra information.
+        extra_datacite = dict()
 
-        # > Extra information: license, subjects, ...
-        extra, extra_datacite = dict(), dict()
         if license_extra:
-            extra_datacite = {
-                'license': license_extra,
-            }
+            extra_datacite['license'] = license_extra
         if attributes.get('subjects'):
-            extra_datacite['subjects'] = attributes.get('subjects', [])
+            extra_datacite['subjects'] = attributes['subjects']
+        if attributes.get('url'):
+            extra_datacite['url'] = attributes['url']
+
+        extra = dict()
 
         if extra_datacite:
             extra['datacite'] = extra_datacite
 
-        # https://guide.fatcat.wiki/entity_release.html
+        # Assemble release.
         re = fatcat_openapi_client.ReleaseEntity(
             work_id=None,
             container_id=container_id,
             release_type=release_type,
-            release_stage=None,
-            title=title, # attributes.titles, various titleType
+            release_stage=release_stage,
+            title=title,
             subtitle=subtitle,
-            original_title=title, # AlternativeTitle?
-            release_year=release_year, # publicationYear
-            release_date=release_date, # date issues/available?
-            publisher=publisher, # attributes.publisher
+            original_title=title,
+            release_year=release_year,
+            release_date=release_date,
+            publisher=publisher,
             ext_ids=fatcat_openapi_client.ReleaseExtIds(
-                doi=attributes.get('doi'), # attributes.doi,
-                # Can we add handle.net link?
+                doi=attributes.get('doi'),
             ),
             contribs=contribs,
-            volume=None,
-            issue=None,
-            pages=None,
+            volume=volume,
+            issue=issue,
+            pages=pages,
             language=language,
-            abstracts=None,
-            refs=None,
+            abstracts=abstracts,
+            refs=refs,
             extra=extra,
             license_slug=license_slug,
         )
         return re
 
     def try_update(self, re, debug=True):
+        """
+        When debug is true, write the release entity as JSON to stdout and do not insert.
+        """
         if debug is True:
-            # print(type(re))
-            print(json.dumps(re.to_dict(), default=extended_encoder))
-            return
-        return False
+            print(json.dumps(re.to_dict(), default=extended_json_encoder))
+            return False
 
-    def insert_batch(self, batch):
-        # Debugging.
-        for item in batch:
-            print(item)
-        return
+        # lookup existing DOI (for datacite, no need to try other ext idents)
+        existing = None
+        try:
+            existing = self.api.lookup_release(doi=re.ext_ids.doi)
+        except fatcat_openapi_client.rest.ApiException as err:
+            if err.status != 404:
+                raise err
+            # doesn't exist, go ahead and insert
+            return True
 
-        # Orig.
+        # eventually we'll want to support "updates", but for now just skip if
+        # entity already exists
+        if existing:
+            self.counts['exists'] += 1
+            return False
+
+        return True
+
+    def insert_batch(self, batch):
         self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch(
             editgroup=fatcat_openapi_client.Editgroup(
                 description=self.editgroup_description,
                 extra=self.editgroup_extra),
             entity_list=batch))
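+        # create_release_auto_batch submits the editgroup together with all
+        # release entities in a single API call.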
 
-def extended_encoder(value):
+def extended_json_encoder(value):
     """
-    Can be used with json.dumps(value, default=extended_encoder) to serialize
+    Can be used with json.dumps(value, default=extended_json_encoder) to serialize
     values that are not serializable by default. https://docs.python.org/3/library/json.html#basic-usage
     """
     if isinstance(value, (datetime.datetime, datetime.date)):
@@ -585,7 +494,7 @@ def extended_encoder(value):
 
 def lookup_license_slug(raw):
     """
-    TODO(martin): reuse from crossref, maybe.
+    TODO(martin): reuse from or combine with crossref, maybe.
     """
     if not raw:
         return None
-- 
cgit v1.2.3