From 55dcece5a476b1492bf6c7f4597a469b48b41264 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Fri, 3 Jan 2020 22:40:53 +0100
Subject: datacite: parse_datacite_dates returns month

As [...] we will soon add support for a release_month field in the release schema.
---
 python/fatcat_tools/importers/datacite.py | 45 ++++++++++++++++++++++++-------
 python/tests/import_datacite.py           | 23 +++++++++++-----
 2 files changed, 51 insertions(+), 17 deletions(-)

(limited to 'python')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 45c8a421..5891f8de 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -9,6 +9,7 @@ functions (parse_datacite_...), which can be tested more easily.
 """
 
 from .common import EntityImporter, clean
+import collections
 import dateparser
 import datetime
 import fatcat_openapi_client
@@ -783,43 +784,68 @@ def parse_datacite_dates(dates):
         'Updated',
     )
 
+    # We need to note the granularity, since a string like "2019" would be
+    # parsed into "2019-01-01", even though the month is unknown. Use 3
+    # granularity types: 'y', 'm', 'd'.
+    Pattern = collections.namedtuple('Pattern', 'layout granularity')
+
     # Before using (expensive) dateparser, try a few common patterns.
-    common_patterns = ('%Y-%m-%d', '%Y-%m', '%Y-%m-%dT%H:%M:%SZ',
-                       '%Y-%m-%dT%H:%M:%S', '%Y')
+    common_patterns = (
+        Pattern('%Y-%m-%d', 'd'),
+        Pattern('%Y-%m', 'm'),
+        Pattern('%Y-%m-%dT%H:%M:%SZ', 'd'),
+        Pattern('%Y-%m-%dT%H:%M:%S', 'd'),
+        Pattern('%Y', 'y'),
+    )
 
     def parse_item(item):
         result, value, year_only = None, item.get('date', ''), False
         release_date, release_month, release_year = None, None, None
 
-        for pattern in common_patterns:
+        for layout, granularity in common_patterns:
             try:
-                result = datetime.datetime.strptime(value, pattern)
+                result = datetime.datetime.strptime(value, layout)
             except ValueError:
                 continue
             else:
-                if pattern == '%Y':
+                if granularity == 'y':
                     year_only = True
                 break
 
         if result is None:
             print('fallback for {}'.format(value), file=sys.stderr)
+            parser = dateparser.DateDataParser()
             try:
-                result = dateparser.parse(value)
+                # Results in a dict with keys: date_obj, period, locale.
+                parse_result = parser.get_date_data(value)
+
+                # A datetime object, later we need a date, only.
+                result = parse_result['date_obj']
+                if result is not None:
+                    if parse_result['period'] == 'year':
+                        return None, None, result.year
+                    elif parse_result['period'] == 'month':
+                        return None, result.month, result.year
+                    else:
+                        return result.date(), result.month, result.year
             except TypeError as err:
                 print("{} date parsing failed with: {}".format(value, err),
                       file=sys.stderr)
-                return result_date, release_month, result_year
 
         if result is None:
             # Unparsable date.
             return release_date, release_month, release_year
 
-        if not year_only:
+        if granularity != 'y':
             release_date = result.date()
         release_year = result.year
+        if granularity in ('m', 'd'):
+            release_month = result.month
 
         return release_date, release_month, release_year
 
+    today = datetime.date.today()
+
     for prio in date_type_prio:
         for item in dates:
             if not item.get('dateType') == prio:
@@ -829,8 +855,7 @@ def parse_datacite_dates(dates):
             if release_date is None and release_year is None:
                 continue
 
-            if release_year < 1000 or release_year > datetime.date.today(
-            ).year + 5:
+            if release_year < 1000 or release_year > today.year + 5:
                 # Skip possibly bogus dates.
                 release_year = None
                 continue
diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py
index 29c608ee..c2fcdec9 100644
--- a/python/tests/import_datacite.py
+++ b/python/tests/import_datacite.py
@@ -173,7 +173,7 @@ def test_parse_datacite_dates():
         Case('None is None', None, (None, None, None)),
         Case('empty list is None', [], (None, None, None)),
         Case('empty item is None', [{}], (None, None, None)),
-        Case('empty item is None', [{'date': '2019'}], (None, None, 2019)),
+        Case('year only yields year only', [{'date': '2019'}], (None, None, 2019)),
         Case('first wins', [{'date': '2019'}, {'date': '2020'}], (None, None, 2019)),
         Case('skip bogus year', [{'date': 'abc'}, {'date': '2020'}], (None, None, 2020)),
         Case('first with type', [
@@ -181,27 +181,36 @@ def test_parse_datacite_dates():
         ], (None, None, 2019)),
         Case('full date', [
             {'date': '2019-12-01', 'dateType': 'Valid'},
-        ], (datetime.date(2019, 12, 1), None, 2019)),
+        ], (datetime.date(2019, 12, 1), 12, 2019)),
         Case('date type prio', [
             {'date': '2000-12-01', 'dateType': 'Valid'},
             {'date': '2010-01-01', 'dateType': 'Updated'},
-        ], (datetime.date(2000, 12, 1), None, 2000)),
+        ], (datetime.date(2000, 12, 1), 12, 2000)),
         Case('date type prio, Available > Updated', [
             {'date': '2010-01-01', 'dateType': 'Updated'},
             {'date': '2000-12-01', 'dateType': 'Available'},
-        ], (datetime.date(2000, 12, 1), None, 2000)),
+        ], (datetime.date(2000, 12, 1), 12, 2000)),
         Case('allow different date formats, Available > Updated', [
             {'date': '2010-01-01T10:00:00', 'dateType': 'Updated'},
             {'date': '2000-12-01T10:00:00', 'dateType': 'Available'},
-        ], (datetime.date(2000, 12, 1), None, 2000)),
+        ], (datetime.date(2000, 12, 1), 12, 2000)),
         Case('allow different date formats, Available > Updated', [
             {'date': '2010-01-01T10:00:00Z', 'dateType': 'Updated'},
             {'date': '2000-12-01T10:00:00Z', 'dateType': 'Available'},
-        ], (datetime.date(2000, 12, 1), None, 2000)),
+        ], (datetime.date(2000, 12, 1), 12, 2000)),
         Case('allow fuzzy date formats, Available > Updated', [
             {'date': '2010', 'dateType': 'Updated'},
             {'date': '2000 Dec 01', 'dateType': 'Available'},
-        ], (datetime.date(2000, 12, 1), None, 2000)),
+        ], (datetime.date(2000, 12, 1), 12, 2000)),
+        Case('fuzzy year only', [
+            {'date': 'Year 2010', 'dateType': 'Issued'},
+        ], (None, None, 2010)),
+        Case('fuzzy year and month', [
+            {'date': 'Year 2010 Feb', 'dateType': 'Issued'},
+        ], (None, 2, 2010)),
+        Case('fuzzy year, month, day', [
+            {'date': 'Year 2010 Feb 24', 'dateType': 'Issued'},
+        ], (datetime.date(2010, 2, 24), 2, 2010)),
         Case('ignore broken date', [
             {'date': 'Febrrr 45', 'dateType': 'Updated'},
         ], (None, None, None)),
-- 
cgit v1.2.3