summaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2020-01-30 13:36:01 +0100
committerMartin Czygan <martin.czygan@gmail.com>2020-01-30 13:36:01 +0100
commit7dec2d1560ebf5ca6d0d337eb246fe345f6ec0bb (patch)
tree7acfda698ff56ce2e9690a4026fbc212fd411895 /python/fatcat_tools
parent55a4f211532c93d8164b0d4719dc0413005941ea (diff)
downloadfatcat-7dec2d1560ebf5ca6d0d337eb246fe345f6ec0bb.tar.gz
fatcat-7dec2d1560ebf5ca6d0d337eb246fe345f6ec0bb.zip
datacite: improve date handling and minor tweak
Records from https://www.micropublication.org/ did not have a date in FC, although raw data contained date strings - they were not using the finer-grained "attributes.date" but "attributes.published" and/or "attributes.publicationYear". Support for those fields has been added, including a test case. During this test (#30) a processing gap for names became clear (author may have "given_name" and "surname", but no "name"). This bug has been fixed, too.
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r--python/fatcat_tools/importers/datacite.py61
1 files changed, 42 insertions, 19 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 2f77481a..15a10cdb 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -311,6 +311,17 @@ class DataciteImporter(EntityImporter):
release_date, release_month, release_year = parse_datacite_dates(
attributes.get('dates', []))
+ # Some records do not use the "dates" field (e.g. micropub), but:
+ # "attributes.published" or "attributes.publicationYear"
+ if not any((release_date, release_month, release_year)):
+ release_date, release_month, release_year = parse_single_date(attributes.get('publicationYear'))
+ if not any((release_date, release_month, release_year)):
+ release_date, release_month, release_year = parse_single_date(attributes.get('published'))
+
+ if not any((release_date, release_month, release_year)):
+ print('[{}] skipping record w/o date: {}'.format(doi, obj), file=sys.stderr)
+ return False
+
# Start with clear stages, e.g. published. TODO(martin): we could
# probably infer a bit more from the relations, e.g.
# "IsPreviousVersionOf" or "IsNewVersionOf".
@@ -490,7 +501,7 @@ class DataciteImporter(EntityImporter):
if len(text) > MAX_ABSTRACT_LENGTH:
text = text[:MAX_ABSTRACT_LENGTH] + " [...]"
- # Detect language.
+ # Detect language. This is fuzzy and may be removed, if too unreliable.
lang = None
try:
lang = langdetect.detect(text)
@@ -719,8 +730,10 @@ class DataciteImporter(EntityImporter):
if name:
name = clean(name)
- if not name:
+ if not any((name, given_name, surname)):
continue
+ if not name:
+ name = "{} {}".format(given_name, surname).strip()
if name in name_blacklist:
continue
if name.lower() in UNKNOWN_MARKERS_LOWER:
@@ -924,6 +937,32 @@ def parse_datacite_titles(titles):
return title, original_language_title, subtitle
+def parse_single_date(value):
+ """
+ Given a single string containing a date in arbitrary format, try to return
+ tuple (date: datetime.date, month: int, year: int).
+ """
+ if not value:
+ return None, None, None
+ if isinstance(value, int):
+ value = str(value)
+ parser = dateparser.DateDataParser()
+ try:
+ # Results in a dict with keys: date_obj, period, locale.
+ parse_result = parser.get_date_data(value)
+ # A datetime object, later we need a date, only.
+ result = parse_result['date_obj']
+ if result is not None:
+ if parse_result['period'] == 'year':
+ return None, None, result.year
+ elif parse_result['period'] == 'month':
+ return None, result.month, result.year
+ else:
+ return result.date(), result.month, result.year
+ except TypeError as err:
+ print("{} date parsing failed with: {}".format(value, err), file=sys.stderr)
+
+ return None, None, None
def parse_datacite_dates(dates):
"""
@@ -981,23 +1020,7 @@ def parse_datacite_dates(dates):
if result is None:
print('fallback for {}'.format(value), file=sys.stderr)
- parser = dateparser.DateDataParser()
- try:
- # Results in a dict with keys: date_obj, period, locale.
- parse_result = parser.get_date_data(value)
-
- # A datetime object, later we need a date, only.
- result = parse_result['date_obj']
- if result is not None:
- if parse_result['period'] == 'year':
- return None, None, result.year
- elif parse_result['period'] == 'month':
- return None, result.month, result.year
- else:
- return result.date(), result.month, result.year
- except TypeError as err:
- print("{} date parsing failed with: {}".format(value, err),
- file=sys.stderr)
+ release_date, release_month, release_year = parse_single_date(value)
if result is None:
# Unparsable date.