summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- python/fatcat_tools/importers/datacite.py 20
-rw-r--r-- python/tests/import_datacite.py 28
2 files changed, 24 insertions, 24 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index d13e855e..45c8a421 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -378,7 +378,7 @@ class DataciteImporter(EntityImporter):
# "attributes.dates[].dateType", values: "Accepted", "Available"
# "Collected", "Copyrighted", "Created", "Issued", "Submitted",
# "Updated", "Valid".
- release_date, release_year = parse_datacite_dates(
+ release_date, release_month, release_year = parse_datacite_dates(
attributes.get('dates', []))
# Start with clear stages, e.g. published. TODO(martin): we could
@@ -762,10 +762,10 @@ def parse_datacite_dates(dates):
Given a list of date fields (under .dates), return tuple, (release_date,
release_month, release_year).
"""
- release_date, release_year = None, None
+ release_date, release_month, release_year = None, None, None
if not dates:
- return release_date, release_year
+ return release_date, release_month, release_year
if not isinstance(dates, list):
raise ValueError('expected a list of date items')
@@ -789,7 +789,7 @@ def parse_datacite_dates(dates):
def parse_item(item):
result, value, year_only = None, item.get('date', ''), False
- release_date, release_year = None, None
+ release_date, release_month, release_year = None, None, None
for pattern in common_patterns:
try:
@@ -808,24 +808,24 @@ def parse_datacite_dates(dates):
except TypeError as err:
print("{} date parsing failed with: {}".format(value, err),
file=sys.stderr)
- return result_date, result_year
+ return result_date, release_month, result_year
if result is None:
# Unparsable date.
- return release_date, release_year
+ return release_date, release_month, release_year
if not year_only:
release_date = result.date()
release_year = result.year
- return release_date, release_year
+ return release_date, release_month, release_year
for prio in date_type_prio:
for item in dates:
if not item.get('dateType') == prio:
continue
- release_date, release_year = parse_item(item)
+ release_date, release_month, release_year = parse_item(item)
if release_date is None and release_year is None:
continue
@@ -841,11 +841,11 @@ def parse_datacite_dates(dates):
if release_date is None and release_year is None:
for item in dates:
- release_date, release_year = parse_item(item)
+ release_date, release_month, release_year = parse_item(item)
if release_year or release_date:
break
- return release_date, release_year
+ return release_date, release_month, release_year
def clean_doi(doi):
"""
diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py
index 54a529c5..29c608ee 100644
--- a/python/tests/import_datacite.py
+++ b/python/tests/import_datacite.py
@@ -170,41 +170,41 @@ def test_parse_datacite_dates():
"""
Case = collections.namedtuple('Case', 'about input result')
cases = [
- Case('None is None', None, (None, None)),
- Case('empty list is None', [], (None, None)),
- Case('empty item is None', [{}], (None, None)),
- Case('empty item is None', [{'date': '2019'}], (None, 2019)),
- Case('first wins', [{'date': '2019'}, {'date': '2020'}], (None, 2019)),
- Case('skip bogus year', [{'date': 'abc'}, {'date': '2020'}], (None, 2020)),
+ Case('None is None', None, (None, None, None)),
+ Case('empty list is None', [], (None, None, None)),
+ Case('empty item is None', [{}], (None, None, None)),
+ Case('empty item is None', [{'date': '2019'}], (None, None, 2019)),
+ Case('first wins', [{'date': '2019'}, {'date': '2020'}], (None, None, 2019)),
+ Case('skip bogus year', [{'date': 'abc'}, {'date': '2020'}], (None, None, 2020)),
Case('first with type', [
{'date': '2019', 'dateType': 'Accepted'}, {'date': '2020'}
- ], (None, 2019)),
+ ], (None, None, 2019)),
Case('full date', [
{'date': '2019-12-01', 'dateType': 'Valid'},
- ], (datetime.date(2019, 12, 1), 2019)),
+ ], (datetime.date(2019, 12, 1), None, 2019)),
Case('date type prio', [
{'date': '2000-12-01', 'dateType': 'Valid'},
{'date': '2010-01-01', 'dateType': 'Updated'},
- ], (datetime.date(2000, 12, 1), 2000)),
+ ], (datetime.date(2000, 12, 1), None, 2000)),
Case('date type prio, Available > Updated', [
{'date': '2010-01-01', 'dateType': 'Updated'},
{'date': '2000-12-01', 'dateType': 'Available'},
- ], (datetime.date(2000, 12, 1), 2000)),
+ ], (datetime.date(2000, 12, 1), None, 2000)),
Case('allow different date formats, Available > Updated', [
{'date': '2010-01-01T10:00:00', 'dateType': 'Updated'},
{'date': '2000-12-01T10:00:00', 'dateType': 'Available'},
- ], (datetime.date(2000, 12, 1), 2000)),
+ ], (datetime.date(2000, 12, 1), None, 2000)),
Case('allow different date formats, Available > Updated', [
{'date': '2010-01-01T10:00:00Z', 'dateType': 'Updated'},
{'date': '2000-12-01T10:00:00Z', 'dateType': 'Available'},
- ], (datetime.date(2000, 12, 1), 2000)),
+ ], (datetime.date(2000, 12, 1), None, 2000)),
Case('allow fuzzy date formats, Available > Updated', [
{'date': '2010', 'dateType': 'Updated'},
{'date': '2000 Dec 01', 'dateType': 'Available'},
- ], (datetime.date(2000, 12, 1), 2000)),
+ ], (datetime.date(2000, 12, 1), None, 2000)),
Case('ignore broken date', [
{'date': 'Febrrr 45', 'dateType': 'Updated'},
- ], (None, None)),
+ ], (None, None, None)),
]
for case in cases:
result = parse_datacite_dates(case.input)