datacite: improve date handling and minor tweak

Records from https://www.micropublication.org/ did not have a date in fatcat (FC), although the raw data contained date strings: these records do not use the finer-grained "attributes.dates" field, but "attributes.published" and/or "attributes.publicationYear" instead. Support for those fields has been added, including a test case. While writing this test (#30), a processing gap for names became apparent: an author may have a "given_name" and a "surname", but no "name". This bug has been fixed, too.
author: Martin Czygan <martin.czygan@gmail.com> 2020-01-30 13:36:01 +0100
committer: Martin Czygan <martin.czygan@gmail.com> 2020-01-30 13:36:01 +0100
commit: 7dec2d1560ebf5ca6d0d337eb246fe345f6ec0bb (patch)
tree: 7acfda698ff56ce2e9690a4026fbc212fd411895 /python
parent: 55a4f211532c93d8164b0d4719dc0413005941ea (diff)
download: fatcat-7dec2d1560ebf5ca6d0d337eb246fe345f6ec0bb.tar.gz
fatcat-7dec2d1560ebf5ca6d0d337eb246fe345f6ec0bb.zip
4 files changed, 153 insertions, 21 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 2f77481a..15a10cdb 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -311,6 +311,17 @@ class DataciteImporter(EntityImporter):
         release_date, release_month, release_year = parse_datacite_dates(
             attributes.get('dates', []))
 
+        # Some records do not use the "dates" field (e.g. micropub), but:
+        # "attributes.published" or "attributes.publicationYear"
+        if not any((release_date, release_month, release_year)):
+            release_date, release_month, release_year = parse_single_date(attributes.get('publicationYear'))
+            if not any((release_date, release_month, release_year)):
+                release_date, release_month, release_year = parse_single_date(attributes.get('published'))
+
+        if not any((release_date, release_month, release_year)):
+            print('[{}] skipping record w/o date: {}'.format(doi, obj), file=sys.stderr)
+            return False
+
         # Start with clear stages, e.g. published. TODO(martin): we could
         # probably infer a bit more from the relations, e.g.
         # "IsPreviousVersionOf" or "IsNewVersionOf".
@@ -490,7 +501,7 @@ class DataciteImporter(EntityImporter):
             if len(text) > MAX_ABSTRACT_LENGTH:
                 text = text[:MAX_ABSTRACT_LENGTH] + " [...]"
 
-            # Detect language.
+            # Detect language. This is fuzzy and may be removed, if too unreliable.
             lang = None
             try:
                 lang = langdetect.detect(text)
@@ -719,8 +730,10 @@ class DataciteImporter(EntityImporter):
 
                 if name:
                     name = clean(name)
-                if not name:
+                if not any((name, given_name, surname)):
                     continue
+                if not name:
+                    name = "{} {}".format(given_name, surname).strip()
                 if name in name_blacklist:
                     continue
                 if name.lower() in UNKNOWN_MARKERS_LOWER:
@@ -924,6 +937,32 @@ def parse_datacite_titles(titles):
 
     return title, original_language_title, subtitle
 
+def parse_single_date(value):
+    """
+    Given a single string containing a date in arbitrary format, try to return
+    tuple (date: datetime.date, month: int, year: int).
+    """
+    if not value:
+        return None, None, None
+    if isinstance(value, int):
+        value = str(value)
+    parser = dateparser.DateDataParser()
+    try:
+        # Results in a dict with keys: date_obj, period, locale.
+        parse_result = parser.get_date_data(value)
+        # A datetime object, later we need a date, only.
+        result = parse_result['date_obj']
+        if result is not None:
+            if parse_result['period'] == 'year':
+                return None, None, result.year
+            elif parse_result['period'] == 'month':
+                return None, result.month, result.year
+            else:
+                return result.date(), result.month, result.year
+    except TypeError as err:
+        print("{} date parsing failed with: {}".format(value, err), file=sys.stderr)
+
+    return None, None, None
 
 def parse_datacite_dates(dates):
     """
@@ -981,23 +1020,7 @@ def parse_datacite_dates(dates):
 
         if result is None:
             print('fallback for {}'.format(value), file=sys.stderr)
-            parser = dateparser.DateDataParser()
-            try:
-                # Results in a dict with keys: date_obj, period, locale.
-                parse_result = parser.get_date_data(value)
-
-                # A datetime object, later we need a date, only.
-                result = parse_result['date_obj']
-                if result is not None:
-                    if parse_result['period'] == 'year':
-                        return None, None, result.year
-                    elif parse_result['period'] == 'month':
-                        return None, result.month, result.year
-                    else:
-                        return result.date(), result.month, result.year
-            except TypeError as err:
-                print("{} date parsing failed with: {}".format(value, err),
-                      file=sys.stderr)
+            release_date, release_month, release_year = parse_single_date(value)
 
         if result is None:
             # Unparsable date.
diff --git a/python/tests/files/datacite/datacite_doc_30.json b/python/tests/files/datacite/datacite_doc_30.json
new file mode 100644
index 00000000..5f851bbb
--- /dev/null
+++ b/python/tests/files/datacite/datacite_doc_30.json
@@ -0,0 +1,72 @@
+{
+  "id": "10.17912/micropub.biology.000143",
+  "type": "dois",
+  "attributes": {
+    "doi": "10.17912/micropub.biology.000143",
+    "identifiers": null,
+    "creators": [
+      {
+        "raw_name": "Celja J Uebel",
+        "givenName": "Celja J",
+        "familyName": "Uebel",
+        "affiliation": [],
+        "role": "author"
+      },
+      {
+        "raw_name": "Carolyn M Phillips",
+        "givenName": "Carolyn M",
+        "familyName": "Phillips",
+        "affiliation": [],
+        "role": "author"
+      }
+    ],
+    "titles": [
+      {
+        "title": "Phase-separated protein dynamics are affected by fluorescent tag choice"
+      }
+    ],
+    "publisher": "microPublication Biology",
+    "container": {},
+    "publicationYear": 2019,
+    "subjects": [],
+    "contributors": [],
+    "dates": null,
+    "language": null,
+    "types": {
+      "resourceTypeGeneral": "DataPaper"
+    },
+    "relatedIdentifiers": [],
+    "sizes": [],
+    "formats": [],
+    "version": null,
+    "rightsList": [],
+    "descriptions": [
+      {
+        "description": "Biological liquid-liquid phase separation",
+        "descriptionType": "Abstract"
+      }
+    ],
+    "geoLocations": [],
+    "fundingReferences": [],
+    "url": "https://www.micropublication.org/journals/biology/micropub.biology.000143",
+    "contentUrl": null,
+    "metadataVersion": 0,
+    "schemaVersion": null,
+    "source": null,
+    "isActive": true,
+    "state": "findable",
+    "reason": null,
+    "created": "2019-08-19T14:43:08.000Z",
+    "registered": "2019-08-19T14:43:09.000Z",
+    "published": "2019",
+    "updated": "2019-11-09T12:32:02.000Z"
+  },
+  "relationships": {
+    "client": {
+      "data": {
+        "id": "caltech.micropub",
+        "type": "clients"
+      }
+    }
+  }
+}
diff --git a/python/tests/files/datacite/datacite_result_30.json b/python/tests/files/datacite/datacite_result_30.json
new file mode 100644
index 00000000..f7d1bb2c
--- /dev/null
+++ b/python/tests/files/datacite/datacite_result_30.json
@@ -0,0 +1,38 @@
+{
+  "abstracts": [
+    {
+      "content": "Biological liquid-liquid phase separation",
+      "lang": "fr",
+      "mimetype": "text/plain"
+    }
+  ],
+  "contribs": [
+    {
+      "index": 0,
+      "given_name": "Celja J",
+      "surname": "Uebel",
+      "raw_name": "Celja J Uebel",
+      "role": "author"
+    },
+    {
+      "index": 1,
+      "given_name": "Carolyn M",
+      "raw_name": "Carolyn M Phillips",
+      "surname": "Phillips",
+      "role": "author"
+    }
+  ],
+  "ext_ids": {
+    "doi": "10.17912/micropub.biology.000143"
+  },
+  "extra": {
+    "datacite": {
+      "resourceTypeGeneral": "DataPaper"
+    }
+  },
+  "refs": [],
+  "release_stage": "published",
+  "release_year": 2019,
+  "publisher": "microPublication Biology",
+  "title": "Phase-separated protein dynamics are affected by fluorescent tag choice"
+}
diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py
index 669a6984..15650375 100644
--- a/python/tests/import_datacite.py
+++ b/python/tests/import_datacite.py
@@ -287,10 +287,9 @@ def test_datacite_conversions(datacite_importer):
     for now.
     """
     datacite_importer.debug = True
-    for i in range(30):
+    for i in range(31):
         src = 'tests/files/datacite/datacite_doc_{0:02d}.json'.format(i)
         dst = 'tests/files/datacite/datacite_result_{0:02d}.json'.format(i)
-        print('testing mapping from {} => {}'.format(src, dst))
         with open(src, 'r') as f:
             re = datacite_importer.parse_record(json.load(f))
             result = entity_to_dict(re)
author	Martin Czygan <martin.czygan@gmail.com>	2020-01-30 13:36:01 +0100
committer	Martin Czygan <martin.czygan@gmail.com>	2020-01-30 13:36:01 +0100
commit	7dec2d1560ebf5ca6d0d337eb246fe345f6ec0bb (patch)
tree	7acfda698ff56ce2e9690a4026fbc212fd411895 /python
parent	55a4f211532c93d8164b0d4719dc0413005941ea (diff)
download	fatcat-7dec2d1560ebf5ca6d0d337eb246fe345f6ec0bb.tar.gz fatcat-7dec2d1560ebf5ca6d0d337eb246fe345f6ec0bb.zip