diff options
Diffstat (limited to 'python')
| -rw-r--r-- | python/fatcat_tools/importers/datacite.py | 45 | ||||
| -rw-r--r-- | python/tests/import_datacite.py | 23 | 
2 files changed, 51 insertions, 17 deletions
| diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 45c8a421..5891f8de 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -9,6 +9,7 @@ functions (parse_datacite_...), which can be tested more easily.  """  from .common import EntityImporter, clean +import collections  import dateparser  import datetime  import fatcat_openapi_client @@ -783,43 +784,68 @@ def parse_datacite_dates(dates):          'Updated',      ) +    # We need to note the granularity, since a string like "2019" would be +    # parsed into "2019-01-01", even though the month is unknown. Use 3 +    # granularity types: 'y', 'm', 'd'. +    Pattern = collections.namedtuple('Pattern', 'layout granularity') +      # Before using (expensive) dateparser, try a few common patterns. -    common_patterns = ('%Y-%m-%d', '%Y-%m', '%Y-%m-%dT%H:%M:%SZ', -                       '%Y-%m-%dT%H:%M:%S', '%Y') +    common_patterns = ( +        Pattern('%Y-%m-%d', 'd'), +        Pattern('%Y-%m', 'm'), +        Pattern('%Y-%m-%dT%H:%M:%SZ', 'd'), +        Pattern('%Y-%m-%dT%H:%M:%S', 'd'), +        Pattern('%Y', 'y'), +    )      def parse_item(item):          result, value, year_only = None, item.get('date', ''), False          release_date, release_month, release_year = None, None, None -        for pattern in common_patterns: +        for layout, granularity in common_patterns:              try: -                result = datetime.datetime.strptime(value, pattern) +                result = datetime.datetime.strptime(value, layout)              except ValueError:                  continue              else: -                if pattern == '%Y': +                if granularity == 'y':                      year_only = True                  break          if result is None:              print('fallback for {}'.format(value), file=sys.stderr) +            parser = dateparser.DateDataParser()              try: -                result = dateparser.parse(value) +                # Results in a dict with keys: date_obj, period, locale. +                parse_result = parser.get_date_data(value) + +                # A datetime object, later we need a date, only. +                result = parse_result['date_obj'] +                if result is not None: +                    if parse_result['period'] == 'year': +                        return None, None, result.year +                    elif parse_result['period'] == 'month': +                        return None, result.month, result.year +                    else: +                        return result.date(), result.month, result.year              except TypeError as err:                  print("{} date parsing failed with: {}".format(value, err),                        file=sys.stderr) -                return result_date, release_month, result_year          if result is None:              # Unparsable date.              return release_date, release_month, release_year -        if not year_only: +        if granularity != 'y':              release_date = result.date()          release_year = result.year +        if granularity in ('m', 'd'): +            release_month = result.month          return release_date, release_month, release_year +    today = datetime.date.today() +      for prio in date_type_prio:          for item in dates:              if not item.get('dateType') == prio: @@ -829,8 +855,7 @@ def parse_datacite_dates(dates):              if release_date is None and release_year is None:                  continue -            if release_year < 1000 or release_year > datetime.date.today( -            ).year + 5: +            if release_year < 1000 or release_year > today.year + 5:                  # Skip possibly bogus dates.                  release_year = None                  continue diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index 29c608ee..c2fcdec9 100644 --- a/python/tests/import_datacite.py +++ b/python/tests/import_datacite.py @@ -173,7 +173,7 @@ def test_parse_datacite_dates():          Case('None is None', None, (None, None, None)),          Case('empty list is None', [], (None, None, None)),          Case('empty item is None', [{}], (None, None, None)), -        Case('empty item is None', [{'date': '2019'}], (None, None, 2019)), +        Case('year only yields year only', [{'date': '2019'}], (None, None, 2019)),          Case('first wins', [{'date': '2019'}, {'date': '2020'}], (None, None, 2019)),          Case('skip bogus year', [{'date': 'abc'}, {'date': '2020'}], (None, None, 2020)),          Case('first with type', [ @@ -181,27 +181,36 @@ def test_parse_datacite_dates():          ], (None, None, 2019)),          Case('full date', [              {'date': '2019-12-01', 'dateType': 'Valid'}, -        ], (datetime.date(2019, 12, 1), None, 2019)), +        ], (datetime.date(2019, 12, 1), 12, 2019)),          Case('date type prio', [              {'date': '2000-12-01', 'dateType': 'Valid'},              {'date': '2010-01-01', 'dateType': 'Updated'}, -        ], (datetime.date(2000, 12, 1), None, 2000)), +        ], (datetime.date(2000, 12, 1), 12, 2000)),          Case('date type prio, Available > Updated', [              {'date': '2010-01-01', 'dateType': 'Updated'},              {'date': '2000-12-01', 'dateType': 'Available'}, -        ], (datetime.date(2000, 12, 1), None, 2000)), +        ], (datetime.date(2000, 12, 1), 12, 2000)),          Case('allow different date formats, Available > Updated', [              {'date': '2010-01-01T10:00:00', 'dateType': 'Updated'},              {'date': '2000-12-01T10:00:00', 'dateType': 'Available'}, -        ], (datetime.date(2000, 12, 1), None, 2000)), +        ], (datetime.date(2000, 12, 1), 12, 2000)),          Case('allow different date formats, Available > Updated', [              {'date': '2010-01-01T10:00:00Z', 'dateType': 'Updated'},              {'date': '2000-12-01T10:00:00Z', 'dateType': 'Available'}, -        ], (datetime.date(2000, 12, 1), None, 2000)), +        ], (datetime.date(2000, 12, 1), 12, 2000)),          Case('allow fuzzy date formats, Available > Updated', [              {'date': '2010', 'dateType': 'Updated'},              {'date': '2000 Dec 01', 'dateType': 'Available'}, -        ], (datetime.date(2000, 12, 1), None, 2000)), +        ], (datetime.date(2000, 12, 1), 12, 2000)), +        Case('fuzzy year only', [ +            {'date': 'Year 2010', 'dateType': 'Issued'}, +        ], (None, None, 2010)), +        Case('fuzzy year and month', [ +            {'date': 'Year 2010 Feb', 'dateType': 'Issued'}, +        ], (None, 2, 2010)), +        Case('fuzzy year, month, day', [ +            {'date': 'Year 2010 Feb 24', 'dateType': 'Issued'}, +        ], (datetime.date(2010, 2, 24), 2, 2010)),          Case('ignore broken date', [              {'date': 'Febrrr 45', 'dateType': 'Updated'},          ], (None, None, None)), | 
