From 7dec2d1560ebf5ca6d0d337eb246fe345f6ec0bb Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Thu, 30 Jan 2020 13:36:01 +0100 Subject: datacite: improve date handling and minor tweak Records from https://www.micropublication.org/ did not have a date in FC, although raw data contained date strings - they were not using the finer-grained "attributes.date" but "attributes.published" and/or "attributes.publicationYear". Support for those fields has been added, including a test case. During this test (#30) a processing gap for names became clear (author may have "given_name" and "surname", but no "name"). This bug has been fixed, too. --- python/fatcat_tools/importers/datacite.py | 61 +++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 19 deletions(-) (limited to 'python/fatcat_tools/importers') diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 2f77481a..15a10cdb 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -311,6 +311,17 @@ class DataciteImporter(EntityImporter): release_date, release_month, release_year = parse_datacite_dates( attributes.get('dates', [])) + # Some records do not use the "dates" field (e.g. micropub), but: + # "attributes.published" or "attributes.publicationYear" + if not any((release_date, release_month, release_year)): + release_date, release_month, release_year = parse_single_date(attributes.get('publicationYear')) + if not any((release_date, release_month, release_year)): + release_date, release_month, release_year = parse_single_date(attributes.get('published')) + + if not any((release_date, release_month, release_year)): + print('[{}] skipping record w/o date: {}'.format(doi, obj), file=sys.stderr) + return False + # Start with clear stages, e.g. published. TODO(martin): we could # probably infer a bit more from the relations, e.g. # "IsPreviousVersionOf" or "IsNewVersionOf". @@ -490,7 +501,7 @@ class DataciteImporter(EntityImporter): if len(text) > MAX_ABSTRACT_LENGTH: text = text[:MAX_ABSTRACT_LENGTH] + " [...]" - # Detect language. + # Detect language. This is fuzzy and may be removed, if too unreliable. lang = None try: lang = langdetect.detect(text) @@ -719,8 +730,10 @@ class DataciteImporter(EntityImporter): if name: name = clean(name) - if not name: + if not any((name, given_name, surname)): continue + if not name: + name = "{} {}".format(given_name, surname).strip() if name in name_blacklist: continue if name.lower() in UNKNOWN_MARKERS_LOWER: @@ -924,6 +937,32 @@ def parse_datacite_titles(titles): return title, original_language_title, subtitle +def parse_single_date(value): + """ + Given a single string containing a date in arbitrary format, try to return + tuple (date: datetime.date, month: int, year: int). + """ + if not value: + return None, None, None + if isinstance(value, int): + value = str(value) + parser = dateparser.DateDataParser() + try: + # Results in a dict with keys: date_obj, period, locale. + parse_result = parser.get_date_data(value) + # A datetime object, later we need a date, only. + result = parse_result['date_obj'] + if result is not None: + if parse_result['period'] == 'year': + return None, None, result.year + elif parse_result['period'] == 'month': + return None, result.month, result.year + else: + return result.date(), result.month, result.year + except TypeError as err: + print("{} date parsing failed with: {}".format(value, err), file=sys.stderr) + + return None, None, None def parse_datacite_dates(dates): """ @@ -981,23 +1020,7 @@ def parse_datacite_dates(dates): if result is None: print('fallback for {}'.format(value), file=sys.stderr) - parser = dateparser.DateDataParser() - try: - # Results in a dict with keys: date_obj, period, locale. - parse_result = parser.get_date_data(value) - - # A datetime object, later we need a date, only. - result = parse_result['date_obj'] - if result is not None: - if parse_result['period'] == 'year': - return None, None, result.year - elif parse_result['period'] == 'month': - return None, result.month, result.year - else: - return result.date(), result.month, result.year - except TypeError as err: - print("{} date parsing failed with: {}".format(value, err), - file=sys.stderr) + release_date, release_month, release_year = parse_single_date(value) if result is None: # Unparsable date. -- cgit v1.2.3 From 046630521f7d3134c9197f5eeae9077154f21991 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 31 Jan 2020 01:43:36 +0100 Subject: datacite: improve docstring --- python/fatcat_tools/importers/datacite.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'python/fatcat_tools/importers') diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 15a10cdb..82ed5a0c 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -1,11 +1,11 @@ """ Prototype importer for datacite.org data. -Example input document at: https://gist.github.com/miku/5610a2d64e3fee82d16f5d3f3a295fc8. +Example input document: https://api.datacite.org/dois/10.7916/d8-f93n-rk51 -Datacite being an aggregator, the data is varied and exposes a couple of -problems in content and structure. A few fields habe their own parsing -functions (parse_datacite_...), which can be tested more easily. +Datacite being an aggregator, the data is heterogenous and exposes a couple of +problems in content and structure. A few fields have their own parsing +functions (parse_datacite_...), which may help testing. """ import collections -- cgit v1.2.3 From 706a2d388f22ada2e3c88b7c58d3a74b1290a6a0 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 31 Jan 2020 01:44:11 +0100 Subject: datacite: do not skip records w/o date --- python/fatcat_tools/importers/datacite.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'python/fatcat_tools/importers') diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 82ed5a0c..f1049909 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -319,8 +319,7 @@ class DataciteImporter(EntityImporter): release_date, release_month, release_year = parse_single_date(attributes.get('published')) if not any((release_date, release_month, release_year)): - print('[{}] skipping record w/o date: {}'.format(doi, obj), file=sys.stderr) - return False + print('[{}] record w/o date: {}'.format(doi, obj), file=sys.stderr) # Start with clear stages, e.g. published. TODO(martin): we could # probably infer a bit more from the relations, e.g. -- cgit v1.2.3 From a42206d2603e28f1311ac3873dc168c78eabffee Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 31 Jan 2020 01:44:46 +0100 Subject: datacite: add exception for https://www.micropublication.org/ --- python/fatcat_tools/importers/datacite.py | 5 +++++ python/tests/files/datacite/datacite_result_30.json | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'python/fatcat_tools/importers') diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index f1049909..b060a18e 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -390,6 +390,11 @@ class DataciteImporter(EntityImporter): len(container_name))) container_name = container_name[0] + # Exception: https://www.micropublication.org/, see: !MR24. + if container_id is None and container_name is None: + if publisher and publisher.lower().startswith('micropublication'): + container_name = publisher + # Volume and issue. volume = container.get('volume') issue = container.get('issue') diff --git a/python/tests/files/datacite/datacite_result_30.json b/python/tests/files/datacite/datacite_result_30.json index f7d1bb2c..fc2c4dfc 100644 --- a/python/tests/files/datacite/datacite_result_30.json +++ b/python/tests/files/datacite/datacite_result_30.json @@ -28,7 +28,8 @@ "extra": { "datacite": { "resourceTypeGeneral": "DataPaper" - } + }, + "container_name": "microPublication Biology" }, "refs": [], "release_stage": "published", -- cgit v1.2.3 From 4c9208fa61ee8eeaebddd65f641926b540818342 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 31 Jan 2020 01:45:25 +0100 Subject: datacite: name shall not be None --- python/fatcat_tools/importers/datacite.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'python/fatcat_tools/importers') diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index b060a18e..20fc399c 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -737,7 +737,7 @@ class DataciteImporter(EntityImporter): if not any((name, given_name, surname)): continue if not name: - name = "{} {}".format(given_name, surname).strip() + name = "{} {}".format(given_name or '', surname or '').strip() if name in name_blacklist: continue if name.lower() in UNKNOWN_MARKERS_LOWER: -- cgit v1.2.3