summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- python/fatcat_tools/importers/datacite.py 45
-rw-r--r-- python/tests/import_datacite.py 23
2 files changed, 51 insertions, 17 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 45c8a421..5891f8de 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -9,6 +9,7 @@ functions (parse_datacite_...), which can be tested more easily.
"""
from .common import EntityImporter, clean
+import collections
import dateparser
import datetime
import fatcat_openapi_client
@@ -783,43 +784,68 @@ def parse_datacite_dates(dates):
'Updated',
)
+ # We need to note the granularity, since a string like "2019" would be
+ # parsed into "2019-01-01", even though the month is unknown. Use 3
+ # granularity types: 'y', 'm', 'd'.
+ Pattern = collections.namedtuple('Pattern', 'layout granularity')
+
# Before using (expensive) dateparser, try a few common patterns.
- common_patterns = ('%Y-%m-%d', '%Y-%m', '%Y-%m-%dT%H:%M:%SZ',
- '%Y-%m-%dT%H:%M:%S', '%Y')
+ common_patterns = (
+ Pattern('%Y-%m-%d', 'd'),
+ Pattern('%Y-%m', 'm'),
+ Pattern('%Y-%m-%dT%H:%M:%SZ', 'd'),
+ Pattern('%Y-%m-%dT%H:%M:%S', 'd'),
+ Pattern('%Y', 'y'),
+ )
def parse_item(item):
result, value, year_only = None, item.get('date', ''), False
release_date, release_month, release_year = None, None, None
- for pattern in common_patterns:
+ for layout, granularity in common_patterns:
try:
- result = datetime.datetime.strptime(value, pattern)
+ result = datetime.datetime.strptime(value, layout)
except ValueError:
continue
else:
- if pattern == '%Y':
+ if granularity == 'y':
year_only = True
break
if result is None:
print('fallback for {}'.format(value), file=sys.stderr)
+ parser = dateparser.DateDataParser()
try:
- result = dateparser.parse(value)
+ # Results in a dict with keys: date_obj, period, locale.
+ parse_result = parser.get_date_data(value)
+
+ # A datetime object; only its date part is needed later.
+ result = parse_result['date_obj']
+ if result is not None:
+ if parse_result['period'] == 'year':
+ return None, None, result.year
+ elif parse_result['period'] == 'month':
+ return None, result.month, result.year
+ else:
+ return result.date(), result.month, result.year
except TypeError as err:
print("{} date parsing failed with: {}".format(value, err),
file=sys.stderr)
- return result_date, release_month, result_year
if result is None:
# Unparsable date.
return release_date, release_month, release_year
- if not year_only:
+ if granularity != 'y':
release_date = result.date()
release_year = result.year
+ if granularity in ('m', 'd'):
+ release_month = result.month
return release_date, release_month, release_year
+ today = datetime.date.today()
+
for prio in date_type_prio:
for item in dates:
if not item.get('dateType') == prio:
@@ -829,8 +855,7 @@ def parse_datacite_dates(dates):
if release_date is None and release_year is None:
continue
- if release_year < 1000 or release_year > datetime.date.today(
- ).year + 5:
+ if release_year < 1000 or release_year > today.year + 5:
# Skip possibly bogus dates.
release_year = None
continue
diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py
index 29c608ee..c2fcdec9 100644
--- a/python/tests/import_datacite.py
+++ b/python/tests/import_datacite.py
@@ -173,7 +173,7 @@ def test_parse_datacite_dates():
Case('None is None', None, (None, None, None)),
Case('empty list is None', [], (None, None, None)),
Case('empty item is None', [{}], (None, None, None)),
- Case('empty item is None', [{'date': '2019'}], (None, None, 2019)),
+ Case('year only yields year only', [{'date': '2019'}], (None, None, 2019)),
Case('first wins', [{'date': '2019'}, {'date': '2020'}], (None, None, 2019)),
Case('skip bogus year', [{'date': 'abc'}, {'date': '2020'}], (None, None, 2020)),
Case('first with type', [
@@ -181,27 +181,36 @@ def test_parse_datacite_dates():
], (None, None, 2019)),
Case('full date', [
{'date': '2019-12-01', 'dateType': 'Valid'},
- ], (datetime.date(2019, 12, 1), None, 2019)),
+ ], (datetime.date(2019, 12, 1), 12, 2019)),
Case('date type prio', [
{'date': '2000-12-01', 'dateType': 'Valid'},
{'date': '2010-01-01', 'dateType': 'Updated'},
- ], (datetime.date(2000, 12, 1), None, 2000)),
+ ], (datetime.date(2000, 12, 1), 12, 2000)),
Case('date type prio, Available > Updated', [
{'date': '2010-01-01', 'dateType': 'Updated'},
{'date': '2000-12-01', 'dateType': 'Available'},
- ], (datetime.date(2000, 12, 1), None, 2000)),
+ ], (datetime.date(2000, 12, 1), 12, 2000)),
Case('allow different date formats, Available > Updated', [
{'date': '2010-01-01T10:00:00', 'dateType': 'Updated'},
{'date': '2000-12-01T10:00:00', 'dateType': 'Available'},
- ], (datetime.date(2000, 12, 1), None, 2000)),
+ ], (datetime.date(2000, 12, 1), 12, 2000)),
Case('allow different date formats, Available > Updated', [
{'date': '2010-01-01T10:00:00Z', 'dateType': 'Updated'},
{'date': '2000-12-01T10:00:00Z', 'dateType': 'Available'},
- ], (datetime.date(2000, 12, 1), None, 2000)),
+ ], (datetime.date(2000, 12, 1), 12, 2000)),
Case('allow fuzzy date formats, Available > Updated', [
{'date': '2010', 'dateType': 'Updated'},
{'date': '2000 Dec 01', 'dateType': 'Available'},
- ], (datetime.date(2000, 12, 1), None, 2000)),
+ ], (datetime.date(2000, 12, 1), 12, 2000)),
+ Case('fuzzy year only', [
+ {'date': 'Year 2010', 'dateType': 'Issued'},
+ ], (None, None, 2010)),
+ Case('fuzzy year and month', [
+ {'date': 'Year 2010 Feb', 'dateType': 'Issued'},
+ ], (None, 2, 2010)),
+ Case('fuzzy year, month, day', [
+ {'date': 'Year 2010 Feb 24', 'dateType': 'Issued'},
+ ], (datetime.date(2010, 2, 24), 2, 2010)),
Case('ignore broken date', [
{'date': 'Febrrr 45', 'dateType': 'Updated'},
], (None, None, None)),