diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2020-01-30 13:36:01 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2020-01-30 13:36:01 +0100 |
commit | 7dec2d1560ebf5ca6d0d337eb246fe345f6ec0bb (patch) | |
tree | 7acfda698ff56ce2e9690a4026fbc212fd411895 /python/fatcat_tools/importers | |
parent | 55a4f211532c93d8164b0d4719dc0413005941ea (diff) | |
download | fatcat-7dec2d1560ebf5ca6d0d337eb246fe345f6ec0bb.tar.gz fatcat-7dec2d1560ebf5ca6d0d337eb246fe345f6ec0bb.zip |
datacite: improve date handling and minor tweak
Records from https://www.micropublication.org/ did not have a date in
FC, although raw data contained date strings - they were not using the
finer-grained "attributes.date" but "attributes.published" and/or
"attributes.publicationYear".
Support for those fields has been added, including a test case.
During this test (#30) a processing gap for names became clear (author
may have "given_name" and "surname", but no "name"). This bug has been
fixed, too.
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r-- | python/fatcat_tools/importers/datacite.py | 61 |
1 files changed, 42 insertions, 19 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 2f77481a..15a10cdb 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -311,6 +311,17 @@ class DataciteImporter(EntityImporter): release_date, release_month, release_year = parse_datacite_dates( attributes.get('dates', [])) + # Some records do not use the "dates" field (e.g. micropub), but: + # "attributes.published" or "attributes.publicationYear" + if not any((release_date, release_month, release_year)): + release_date, release_month, release_year = parse_single_date(attributes.get('publicationYear')) + if not any((release_date, release_month, release_year)): + release_date, release_month, release_year = parse_single_date(attributes.get('published')) + + if not any((release_date, release_month, release_year)): + print('[{}] skipping record w/o date: {}'.format(doi, obj), file=sys.stderr) + return False + # Start with clear stages, e.g. published. TODO(martin): we could # probably infer a bit more from the relations, e.g. # "IsPreviousVersionOf" or "IsNewVersionOf". @@ -490,7 +501,7 @@ class DataciteImporter(EntityImporter): if len(text) > MAX_ABSTRACT_LENGTH: text = text[:MAX_ABSTRACT_LENGTH] + " [...]" - # Detect language. + # Detect language. This is fuzzy and may be removed, if too unreliable. lang = None try: lang = langdetect.detect(text) @@ -719,8 +730,10 @@ class DataciteImporter(EntityImporter): if name: name = clean(name) - if not name: + if not any((name, given_name, surname)): continue + if not name: + name = "{} {}".format(given_name, surname).strip() if name in name_blacklist: continue if name.lower() in UNKNOWN_MARKERS_LOWER: @@ -924,6 +937,32 @@ def parse_datacite_titles(titles): return title, original_language_title, subtitle +def parse_single_date(value): + """ + Given a single string containing a date in arbitrary format, try to return + tuple (date: datetime.date, month: int, year: int). + """ + if not value: + return None, None, None + if isinstance(value, int): + value = str(value) + parser = dateparser.DateDataParser() + try: + # Results in a dict with keys: date_obj, period, locale. + parse_result = parser.get_date_data(value) + # A datetime object, later we need a date, only. + result = parse_result['date_obj'] + if result is not None: + if parse_result['period'] == 'year': + return None, None, result.year + elif parse_result['period'] == 'month': + return None, result.month, result.year + else: + return result.date(), result.month, result.year + except TypeError as err: + print("{} date parsing failed with: {}".format(value, err), file=sys.stderr) + + return None, None, None def parse_datacite_dates(dates): """ @@ -981,23 +1020,7 @@ def parse_datacite_dates(dates): if result is None: print('fallback for {}'.format(value), file=sys.stderr) - parser = dateparser.DateDataParser() - try: - # Results in a dict with keys: date_obj, period, locale. - parse_result = parser.get_date_data(value) - - # A datetime object, later we need a date, only. - result = parse_result['date_obj'] - if result is not None: - if parse_result['period'] == 'year': - return None, None, result.year - elif parse_result['period'] == 'month': - return None, result.month, result.year - else: - return result.date(), result.month, result.year - except TypeError as err: - print("{} date parsing failed with: {}".format(value, err), - file=sys.stderr) + release_date, release_month, release_year = parse_single_date(value) if result is None: # Unparsable date. |