diff options
-rw-r--r-- | python/fatcat_tools/importers/datacite.py | 45 | ||||
-rw-r--r-- | python/tests/import_datacite.py | 23 |
2 files changed, 51 insertions, 17 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 45c8a421..5891f8de 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -9,6 +9,7 @@ functions (parse_datacite_...), which can be tested more easily. """ from .common import EntityImporter, clean +import collections import dateparser import datetime import fatcat_openapi_client @@ -783,43 +784,68 @@ def parse_datacite_dates(dates): 'Updated', ) + # We need to note the granularity, since a string like "2019" would be + # parsed into "2019-01-01", even though the month is unknown. Use 3 + # granularity types: 'y', 'm', 'd'. + Pattern = collections.namedtuple('Pattern', 'layout granularity') + # Before using (expensive) dateparser, try a few common patterns. - common_patterns = ('%Y-%m-%d', '%Y-%m', '%Y-%m-%dT%H:%M:%SZ', - '%Y-%m-%dT%H:%M:%S', '%Y') + common_patterns = ( + Pattern('%Y-%m-%d', 'd'), + Pattern('%Y-%m', 'm'), + Pattern('%Y-%m-%dT%H:%M:%SZ', 'd'), + Pattern('%Y-%m-%dT%H:%M:%S', 'd'), + Pattern('%Y', 'y'), + ) def parse_item(item): result, value, year_only = None, item.get('date', ''), False release_date, release_month, release_year = None, None, None - for pattern in common_patterns: + for layout, granularity in common_patterns: try: - result = datetime.datetime.strptime(value, pattern) + result = datetime.datetime.strptime(value, layout) except ValueError: continue else: - if pattern == '%Y': + if granularity == 'y': year_only = True break if result is None: print('fallback for {}'.format(value), file=sys.stderr) + parser = dateparser.DateDataParser() try: - result = dateparser.parse(value) + # Results in a dict with keys: date_obj, period, locale. + parse_result = parser.get_date_data(value) + + # A datetime object, later we need a date, only. + result = parse_result['date_obj'] + if result is not None: + if parse_result['period'] == 'year': + return None, None, result.year + elif parse_result['period'] == 'month': + return None, result.month, result.year + else: + return result.date(), result.month, result.year except TypeError as err: print("{} date parsing failed with: {}".format(value, err), file=sys.stderr) - return result_date, release_month, result_year if result is None: # Unparsable date. return release_date, release_month, release_year - if not year_only: + if granularity != 'y': release_date = result.date() release_year = result.year + if granularity in ('m', 'd'): + release_month = result.month return release_date, release_month, release_year + today = datetime.date.today() + for prio in date_type_prio: for item in dates: if not item.get('dateType') == prio: @@ -829,8 +855,7 @@ def parse_datacite_dates(dates): if release_date is None and release_year is None: continue - if release_year < 1000 or release_year > datetime.date.today( - ).year + 5: + if release_year < 1000 or release_year > today.year + 5: # Skip possibly bogus dates. release_year = None continue diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index 29c608ee..c2fcdec9 100644 --- a/python/tests/import_datacite.py +++ b/python/tests/import_datacite.py @@ -173,7 +173,7 @@ def test_parse_datacite_dates(): Case('None is None', None, (None, None, None)), Case('empty list is None', [], (None, None, None)), Case('empty item is None', [{}], (None, None, None)), - Case('empty item is None', [{'date': '2019'}], (None, None, 2019)), + Case('year only yields year only', [{'date': '2019'}], (None, None, 2019)), Case('first wins', [{'date': '2019'}, {'date': '2020'}], (None, None, 2019)), Case('skip bogus year', [{'date': 'abc'}, {'date': '2020'}], (None, None, 2020)), Case('first with type', [ @@ -181,27 +181,36 @@ def test_parse_datacite_dates(): ], (None, None, 2019)), Case('full date', [ {'date': '2019-12-01', 'dateType': 'Valid'}, - ], (datetime.date(2019, 12, 1), None, 2019)), + ], (datetime.date(2019, 12, 1), 12, 2019)), Case('date type prio', [ {'date': '2000-12-01', 'dateType': 'Valid'}, {'date': '2010-01-01', 'dateType': 'Updated'}, - ], (datetime.date(2000, 12, 1), None, 2000)), + ], (datetime.date(2000, 12, 1), 12, 2000)), Case('date type prio, Available > Updated', [ {'date': '2010-01-01', 'dateType': 'Updated'}, {'date': '2000-12-01', 'dateType': 'Available'}, - ], (datetime.date(2000, 12, 1), None, 2000)), + ], (datetime.date(2000, 12, 1), 12, 2000)), Case('allow different date formats, Available > Updated', [ {'date': '2010-01-01T10:00:00', 'dateType': 'Updated'}, {'date': '2000-12-01T10:00:00', 'dateType': 'Available'}, - ], (datetime.date(2000, 12, 1), None, 2000)), + ], (datetime.date(2000, 12, 1), 12, 2000)), Case('allow different date formats, Available > Updated', [ {'date': '2010-01-01T10:00:00Z', 'dateType': 'Updated'}, {'date': '2000-12-01T10:00:00Z', 'dateType': 'Available'}, - ], (datetime.date(2000, 12, 1), None, 2000)), + ], (datetime.date(2000, 12, 1), 12, 2000)), Case('allow fuzzy date formats, Available > Updated', [ {'date': '2010', 'dateType': 'Updated'}, {'date': '2000 Dec 01', 'dateType': 'Available'}, - ], (datetime.date(2000, 12, 1), None, 2000)), + ], (datetime.date(2000, 12, 1), 12, 2000)), + Case('fuzzy year only', [ + {'date': 'Year 2010', 'dateType': 'Issued'}, + ], (None, None, 2010)), + Case('fuzzy year and month', [ + {'date': 'Year 2010 Feb', 'dateType': 'Issued'}, + ], (None, 2, 2010)), + Case('fuzzy year, month, day', [ + {'date': 'Year 2010 Feb 24', 'dateType': 'Issued'}, + ], (datetime.date(2010, 2, 24), 2, 2010)), Case('ignore broken date', [ {'date': 'Febrrr 45', 'dateType': 'Updated'}, ], (None, None, None)), |