diff options
| author | Martin Czygan <martin.czygan@gmail.com> | 2020-01-03 19:51:53 +0100 | 
|---|---|---|
| committer | Martin Czygan <martin.czygan@gmail.com> | 2020-01-03 19:51:53 +0100 | 
| commit | e4402d6d4b162d57507d5beb57de88017cea549d (patch) | |
| tree | 6f409c5a98d52028611122409643457b5e06278c | |
| parent | 1e5680202fe2bf0348f969ffd0e4b211cc45e1e5 (diff) | |
| download | fatcat-e4402d6d4b162d57507d5beb57de88017cea549d.tar.gz fatcat-e4402d6d4b162d57507d5beb57de88017cea549d.zip | |
datacite: prepare release_month (stub)
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | python/fatcat_tools/importers/datacite.py | 20 |
| -rw-r--r-- | python/tests/import_datacite.py | 28 |
2 files changed, 24 insertions, 24 deletions
```diff
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index d13e855e..45c8a421 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -378,7 +378,7 @@ class DataciteImporter(EntityImporter):
         # "attributes.dates[].dateType", values: "Accepted", "Available"
         # "Collected", "Copyrighted", "Created", "Issued", "Submitted",
         # "Updated", "Valid".
-        release_date, release_year = parse_datacite_dates(
+        release_date, release_month, release_year = parse_datacite_dates(
             attributes.get('dates', []))
 
         # Start with clear stages, e.g. published. TODO(martin): we could
@@ -762,10 +762,10 @@ def parse_datacite_dates(dates):
     Given a list of date fields (under .dates), return tuple, (release_date,
     release_year).
     """
-    release_date, release_year = None, None
+    release_date, release_month, release_year = None, None, None
 
     if not dates:
-        return release_date, release_year
+        return release_date, release_month, release_year
 
     if not isinstance(dates, list):
         raise ValueError('expected a list of date items')
@@ -789,7 +789,7 @@ def parse_datacite_dates(dates):
 
     def parse_item(item):
         result, value, year_only = None, item.get('date', ''), False
-        release_date, release_year = None, None
+        release_date, release_month, release_year = None, None, None
 
         for pattern in common_patterns:
             try:
@@ -808,24 +808,24 @@ def parse_datacite_dates(dates):
             except TypeError as err:
                 print("{} date parsing failed with: {}".format(value, err),
                       file=sys.stderr)
-                return result_date, result_year
+                return result_date, release_month, result_year
 
         if result is None:
             # Unparsable date.
-            return release_date, release_year
+            return release_date, release_month, release_year
 
         if not year_only:
             release_date = result.date()
         release_year = result.year
 
-        return release_date, release_year
+        return release_date, release_month, release_year
 
     for prio in date_type_prio:
         for item in dates:
             if not item.get('dateType') == prio:
                 continue
 
-            release_date, release_year = parse_item(item)
+            release_date, release_month, release_year = parse_item(item)
             if release_date is None and release_year is None:
                 continue
 
@@ -841,11 +841,11 @@ def parse_datacite_dates(dates):
 
     if release_date is None and release_year is None:
         for item in dates:
-            release_date, release_year = parse_item(item)
+            release_date, release_month, release_year = parse_item(item)
             if release_year or release_date:
                 break
 
-    return release_date, release_year
+    return release_date, release_month, release_year
 
 def clean_doi(doi):
     """
diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py
index 54a529c5..29c608ee 100644
--- a/python/tests/import_datacite.py
+++ b/python/tests/import_datacite.py
@@ -170,41 +170,41 @@ def test_parse_datacite_dates():
     """
     Case = collections.namedtuple('Case', 'about input result')
     cases = [
-        Case('None is None', None, (None, None)),
-        Case('empty list is None', [], (None, None)),
-        Case('empty item is None', [{}], (None, None)),
-        Case('empty item is None', [{'date': '2019'}], (None, 2019)),
-        Case('first wins', [{'date': '2019'}, {'date': '2020'}], (None, 2019)),
-        Case('skip bogus year', [{'date': 'abc'}, {'date': '2020'}], (None, 2020)),
+        Case('None is None', None, (None, None, None)),
+        Case('empty list is None', [], (None, None, None)),
+        Case('empty item is None', [{}], (None, None, None)),
+        Case('empty item is None', [{'date': '2019'}], (None, None, 2019)),
+        Case('first wins', [{'date': '2019'}, {'date': '2020'}], (None, None, 2019)),
+        Case('skip bogus year', [{'date': 'abc'}, {'date': '2020'}], (None, None, 2020)),
         Case('first with type', [
             {'date': '2019', 'dateType': 'Accepted'}, {'date': '2020'}
-        ], (None, 2019)),
+        ], (None, None, 2019)),
         Case('full date', [
             {'date': '2019-12-01', 'dateType': 'Valid'},
-        ], (datetime.date(2019, 12, 1), 2019)),
+        ], (datetime.date(2019, 12, 1), None, 2019)),
         Case('date type prio', [
             {'date': '2000-12-01', 'dateType': 'Valid'},
             {'date': '2010-01-01', 'dateType': 'Updated'},
-        ], (datetime.date(2000, 12, 1), 2000)),
+        ], (datetime.date(2000, 12, 1), None, 2000)),
         Case('date type prio, Available > Updated', [
             {'date': '2010-01-01', 'dateType': 'Updated'},
             {'date': '2000-12-01', 'dateType': 'Available'},
-        ], (datetime.date(2000, 12, 1), 2000)),
+        ], (datetime.date(2000, 12, 1), None, 2000)),
         Case('allow different date formats, Available > Updated', [
             {'date': '2010-01-01T10:00:00', 'dateType': 'Updated'},
             {'date': '2000-12-01T10:00:00', 'dateType': 'Available'},
-        ], (datetime.date(2000, 12, 1), 2000)),
+        ], (datetime.date(2000, 12, 1), None, 2000)),
         Case('allow different date formats, Available > Updated', [
             {'date': '2010-01-01T10:00:00Z', 'dateType': 'Updated'},
             {'date': '2000-12-01T10:00:00Z', 'dateType': 'Available'},
-        ], (datetime.date(2000, 12, 1), 2000)),
+        ], (datetime.date(2000, 12, 1), None, 2000)),
         Case('allow fuzzy date formats, Available > Updated', [
             {'date': '2010', 'dateType': 'Updated'},
             {'date': '2000 Dec 01', 'dateType': 'Available'},
-        ], (datetime.date(2000, 12, 1), 2000)),
+        ], (datetime.date(2000, 12, 1), None, 2000)),
         Case('ignore broken date', [
             {'date': 'Febrrr 45', 'dateType': 'Updated'},
-        ], (None, None)),
+        ], (None, None, None)),
     ]
     for case in cases:
         result = parse_datacite_dates(case.input)
```
