diff options
-rw-r--r-- | python/fatcat_tools/importers/datacite.py | 20 | ||||
-rw-r--r-- | python/tests/import_datacite.py | 28 |
2 files changed, 24 insertions, 24 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index d13e855e..45c8a421 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -378,7 +378,7 @@ class DataciteImporter(EntityImporter): # "attributes.dates[].dateType", values: "Accepted", "Available" # "Collected", "Copyrighted", "Created", "Issued", "Submitted", # "Updated", "Valid". - release_date, release_year = parse_datacite_dates( + release_date, release_month, release_year = parse_datacite_dates( attributes.get('dates', [])) # Start with clear stages, e.g. published. TODO(martin): we could @@ -762,10 +762,10 @@ def parse_datacite_dates(dates): Given a list of date fields (under .dates), return tuple, (release_date, release_year). """ - release_date, release_year = None, None + release_date, release_month, release_year = None, None, None if not dates: - return release_date, release_year + return release_date, release_month, release_year if not isinstance(dates, list): raise ValueError('expected a list of date items') @@ -789,7 +789,7 @@ def parse_datacite_dates(dates): def parse_item(item): result, value, year_only = None, item.get('date', ''), False - release_date, release_year = None, None + release_date, release_month, release_year = None, None, None for pattern in common_patterns: try: @@ -808,24 +808,24 @@ def parse_datacite_dates(dates): except TypeError as err: print("{} date parsing failed with: {}".format(value, err), file=sys.stderr) - return result_date, result_year + return result_date, release_month, result_year if result is None: # Unparsable date. - return release_date, release_year + return release_date, release_month, release_year if not year_only: release_date = result.date() release_year = result.year - return release_date, release_year + return release_date, release_month, release_year for prio in date_type_prio: for item in dates: if not item.get('dateType') == prio: continue - release_date, release_year = parse_item(item) + release_date, release_month, release_year = parse_item(item) if release_date is None and release_year is None: continue @@ -841,11 +841,11 @@ def parse_datacite_dates(dates): if release_date is None and release_year is None: for item in dates: - release_date, release_year = parse_item(item) + release_date, release_month, release_year = parse_item(item) if release_year or release_date: break - return release_date, release_year + return release_date, release_month, release_year def clean_doi(doi): """ diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index 54a529c5..29c608ee 100644 --- a/python/tests/import_datacite.py +++ b/python/tests/import_datacite.py @@ -170,41 +170,41 @@ def test_parse_datacite_dates(): """ Case = collections.namedtuple('Case', 'about input result') cases = [ - Case('None is None', None, (None, None)), - Case('empty list is None', [], (None, None)), - Case('empty item is None', [{}], (None, None)), - Case('empty item is None', [{'date': '2019'}], (None, 2019)), - Case('first wins', [{'date': '2019'}, {'date': '2020'}], (None, 2019)), - Case('skip bogus year', [{'date': 'abc'}, {'date': '2020'}], (None, 2020)), + Case('None is None', None, (None, None, None)), + Case('empty list is None', [], (None, None, None)), + Case('empty item is None', [{}], (None, None, None)), + Case('empty item is None', [{'date': '2019'}], (None, None, 2019)), + Case('first wins', [{'date': '2019'}, {'date': '2020'}], (None, None, 2019)), + Case('skip bogus year', [{'date': 'abc'}, {'date': '2020'}], (None, None, 2020)), Case('first with type', [ {'date': '2019', 'dateType': 'Accepted'}, {'date': '2020'} - ], (None, 2019)), + ], (None, None, 2019)), Case('full date', [ {'date': '2019-12-01', 'dateType': 'Valid'}, - ], (datetime.date(2019, 12, 1), 2019)), + ], (datetime.date(2019, 12, 1), None, 2019)), Case('date type prio', [ {'date': '2000-12-01', 'dateType': 'Valid'}, {'date': '2010-01-01', 'dateType': 'Updated'}, - ], (datetime.date(2000, 12, 1), 2000)), + ], (datetime.date(2000, 12, 1), None, 2000)), Case('date type prio, Available > Updated', [ {'date': '2010-01-01', 'dateType': 'Updated'}, {'date': '2000-12-01', 'dateType': 'Available'}, - ], (datetime.date(2000, 12, 1), 2000)), + ], (datetime.date(2000, 12, 1), None, 2000)), Case('allow different date formats, Available > Updated', [ {'date': '2010-01-01T10:00:00', 'dateType': 'Updated'}, {'date': '2000-12-01T10:00:00', 'dateType': 'Available'}, - ], (datetime.date(2000, 12, 1), 2000)), + ], (datetime.date(2000, 12, 1), None, 2000)), Case('allow different date formats, Available > Updated', [ {'date': '2010-01-01T10:00:00Z', 'dateType': 'Updated'}, {'date': '2000-12-01T10:00:00Z', 'dateType': 'Available'}, - ], (datetime.date(2000, 12, 1), 2000)), + ], (datetime.date(2000, 12, 1), None, 2000)), Case('allow fuzzy date formats, Available > Updated', [ {'date': '2010', 'dateType': 'Updated'}, {'date': '2000 Dec 01', 'dateType': 'Available'}, - ], (datetime.date(2000, 12, 1), 2000)), + ], (datetime.date(2000, 12, 1), None, 2000)), Case('ignore broken date', [ {'date': 'Febrrr 45', 'dateType': 'Updated'}, - ], (None, None)), + ], (None, None, None)), ] for case in cases: result = parse_datacite_dates(case.input) |