4 files changed, 162 insertions, 25 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 2f77481a..20fc399c 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -1,11 +1,11 @@
 """
 Prototype importer for datacite.org data.
 
-Example input document at: https://gist.github.com/miku/5610a2d64e3fee82d16f5d3f3a295fc8.
+Example input document: https://api.datacite.org/dois/10.7916/d8-f93n-rk51
 
-Datacite being an aggregator, the data is varied and exposes a couple of
-problems in content and structure. A few fields habe their own parsing
-functions (parse_datacite_...), which can be tested more easily.
+Datacite being an aggregator, the data is heterogeneous and exposes a couple of
+problems in content and structure. A few fields have their own parsing
+functions (parse_datacite_...), which may help testing.
 """
 
 import collections
@@ -311,6 +311,16 @@ class DataciteImporter(EntityImporter):
         release_date, release_month, release_year = parse_datacite_dates(
             attributes.get('dates', []))
 
+        # Some records do not use the "dates" field (e.g. micropub), but:
+        # "attributes.published" or "attributes.publicationYear"
+        if not any((release_date, release_month, release_year)):
+            release_date, release_month, release_year = parse_single_date(attributes.get('publicationYear'))
+            if not any((release_date, release_month, release_year)):
+                release_date, release_month, release_year = parse_single_date(attributes.get('published'))
+
+        if not any((release_date, release_month, release_year)):
+            print('[{}] record w/o date: {}'.format(doi, obj), file=sys.stderr)
+
         # Start with clear stages, e.g. published. TODO(martin): we could
         # probably infer a bit more from the relations, e.g.
         # "IsPreviousVersionOf" or "IsNewVersionOf".
@@ -380,6 +390,11 @@ class DataciteImporter(EntityImporter):
                                 len(container_name)))
                             container_name = container_name[0]
 
+        # Exception: https://www.micropublication.org/, see: !MR24.
+        if container_id is None and container_name is None:
+            if publisher and publisher.lower().startswith('micropublication'):
+                container_name = publisher
+
         # Volume and issue.
         volume = container.get('volume')
         issue = container.get('issue')
@@ -490,7 +505,7 @@ class DataciteImporter(EntityImporter):
             if len(text) > MAX_ABSTRACT_LENGTH:
                 text = text[:MAX_ABSTRACT_LENGTH] + " [...]"
 
-            # Detect language.
+            # Detect language. This is fuzzy and may be removed, if too unreliable.
             lang = None
             try:
                 lang = langdetect.detect(text)
@@ -719,8 +734,10 @@ class DataciteImporter(EntityImporter):
 
                 if name:
                     name = clean(name)
-                if not name:
+                if not any((name, given_name, surname)):
                     continue
+                if not name:
+                    name = "{} {}".format(given_name or '', surname or '').strip()
                 if name in name_blacklist:
                     continue
                 if name.lower() in UNKNOWN_MARKERS_LOWER:
@@ -924,6 +941,32 @@ def parse_datacite_titles(titles):
 
     return title, original_language_title, subtitle
 
+def parse_single_date(value):
+    """
+    Parse one date value (str or int) with dateparser; return a tuple (date,
+    month, year), with None for unknown parts and (None, None, None) on failure.
+    """
+    if not value:
+        return None, None, None
+    if isinstance(value, int):
+        value = str(value)
+    parser = dateparser.DateDataParser()
+    try:
+        # Results in a dict with keys: date_obj, period, locale.
+        parse_result = parser.get_date_data(value)
+        # date_obj is a datetime or None; 'period' decides how much of it we keep.
+        result = parse_result['date_obj']
+        if result is not None:
+            if parse_result['period'] == 'year':
+                return None, None, result.year
+            elif parse_result['period'] == 'month':
+                return None, result.month, result.year
+            else:
+                return result.date(), result.month, result.year
+    except TypeError as err:
+        print("{} date parsing failed with: {}".format(value, err), file=sys.stderr)
+
+    return None, None, None
 
 def parse_datacite_dates(dates):
     """
@@ -981,23 +1024,7 @@ def parse_datacite_dates(dates):
 
         if result is None:
             print('fallback for {}'.format(value), file=sys.stderr)
-            parser = dateparser.DateDataParser()
-            try:
-                # Results in a dict with keys: date_obj, period, locale.
-                parse_result = parser.get_date_data(value)
-
-                # A datetime object, later we need a date, only.
-                result = parse_result['date_obj']
-                if result is not None:
-                    if parse_result['period'] == 'year':
-                        return None, None, result.year
-                    elif parse_result['period'] == 'month':
-                        return None, result.month, result.year
-                    else:
-                        return result.date(), result.month, result.year
-            except TypeError as err:
-                print("{} date parsing failed with: {}".format(value, err),
-                      file=sys.stderr)
+            release_date, release_month, release_year = parse_single_date(value)
 
         if result is None:
             # Unparsable date.
diff --git a/python/tests/files/datacite/datacite_doc_30.json b/python/tests/files/datacite/datacite_doc_30.json
new file mode 100644
index 00000000..5f851bbb
--- /dev/null
+++ b/python/tests/files/datacite/datacite_doc_30.json
@@ -0,0 +1,72 @@
+{
+  "id": "10.17912/micropub.biology.000143",
+  "type": "dois",
+  "attributes": {
+    "doi": "10.17912/micropub.biology.000143",
+    "identifiers": null,
+    "creators": [
+      {
+        "raw_name": "Celja J Uebel",
+        "givenName": "Celja J",
+        "familyName": "Uebel",
+        "affiliation": [],
+        "role": "author"
+      },
+      {
+        "raw_name": "Carolyn M Phillips",
+        "givenName": "Carolyn M",
+        "familyName": "Phillips",
+        "affiliation": [],
+        "role": "author"
+      }
+    ],
+    "titles": [
+      {
+        "title": "Phase-separated protein dynamics are affected by fluorescent tag choice"
+      }
+    ],
+    "publisher": "microPublication Biology",
+    "container": {},
+    "publicationYear": 2019,
+    "subjects": [],
+    "contributors": [],
+    "dates": null,
+    "language": null,
+    "types": {
+      "resourceTypeGeneral": "DataPaper"
+    },
+    "relatedIdentifiers": [],
+    "sizes": [],
+    "formats": [],
+    "version": null,
+    "rightsList": [],
+    "descriptions": [
+      {
+        "description": "Biological liquid-liquid phase separation",
+        "descriptionType": "Abstract"
+      }
+    ],
+    "geoLocations": [],
+    "fundingReferences": [],
+    "url": "https://www.micropublication.org/journals/biology/micropub.biology.000143",
+    "contentUrl": null,
+    "metadataVersion": 0,
+    "schemaVersion": null,
+    "source": null,
+    "isActive": true,
+    "state": "findable",
+    "reason": null,
+    "created": "2019-08-19T14:43:08.000Z",
+    "registered": "2019-08-19T14:43:09.000Z",
+    "published": "2019",
+    "updated": "2019-11-09T12:32:02.000Z"
+  },
+  "relationships": {
+    "client": {
+      "data": {
+        "id": "caltech.micropub",
+        "type": "clients"
+      }
+    }
+  }
+}
diff --git a/python/tests/files/datacite/datacite_result_30.json b/python/tests/files/datacite/datacite_result_30.json
new file mode 100644
index 00000000..fc2c4dfc
--- /dev/null
+++ b/python/tests/files/datacite/datacite_result_30.json
@@ -0,0 +1,39 @@
+{
+  "abstracts": [
+    {
+      "content": "Biological liquid-liquid phase separation",
+      "lang": "fr",
+      "mimetype": "text/plain"
+    }
+  ],
+  "contribs": [
+    {
+      "index": 0,
+      "given_name": "Celja J",
+      "surname": "Uebel",
+      "raw_name": "Celja J Uebel",
+      "role": "author"
+    },
+    {
+      "index": 1,
+      "given_name": "Carolyn M",
+      "raw_name": "Carolyn M Phillips",
+      "surname": "Phillips",
+      "role": "author"
+    }
+  ],
+  "ext_ids": {
+    "doi": "10.17912/micropub.biology.000143"
+  },
+  "extra": {
+    "datacite": {
+      "resourceTypeGeneral": "DataPaper"
+    },
+    "container_name": "microPublication Biology"
+  },
+  "refs": [],
+  "release_stage": "published",
+  "release_year": 2019,
+  "publisher": "microPublication Biology",
+  "title": "Phase-separated protein dynamics are affected by fluorescent tag choice"
+}
diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py
index 669a6984..15650375 100644
--- a/python/tests/import_datacite.py
+++ b/python/tests/import_datacite.py
@@ -287,10 +287,9 @@ def test_datacite_conversions(datacite_importer):
     for now.
     """
     datacite_importer.debug = True
-    for i in range(30):
+    for i in range(31):
         src = 'tests/files/datacite/datacite_doc_{0:02d}.json'.format(i)
         dst = 'tests/files/datacite/datacite_result_{0:02d}.json'.format(i)
-        print('testing mapping from {} => {}'.format(src, dst))
         with open(src, 'r') as f:
             re = datacite_importer.parse_record(json.load(f))
             result = entity_to_dict(re)