From 7dec2d1560ebf5ca6d0d337eb246fe345f6ec0bb Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Thu, 30 Jan 2020 13:36:01 +0100 Subject: datacite: improve date handling and minor tweak Records from https://www.micropublication.org/ did not have a date in fatcat, although raw data contained date strings - they were not using the finer-grained "attributes.dates" but "attributes.published" and/or "attributes.publicationYear". Support for those fields has been added, including a test case. During this test (#30) a processing gap for names became clear (author may have "given_name" and "surname", but no "name"). This bug has been fixed, too. --- python/fatcat_tools/importers/datacite.py | 61 ++++++++++++------ python/tests/files/datacite/datacite_doc_30.json | 72 ++++++++++++++++++++++ .../tests/files/datacite/datacite_result_30.json | 38 ++++++++++++ python/tests/import_datacite.py | 3 +- 4 files changed, 153 insertions(+), 21 deletions(-) create mode 100644 python/tests/files/datacite/datacite_doc_30.json create mode 100644 python/tests/files/datacite/datacite_result_30.json diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 2f77481a..15a10cdb 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -311,6 +311,17 @@ class DataciteImporter(EntityImporter): release_date, release_month, release_year = parse_datacite_dates( attributes.get('dates', [])) + # Some records do not use the "dates" field (e.g. 
micropub), but: + # "attributes.published" or "attributes.publicationYear" + if not any((release_date, release_month, release_year)): + release_date, release_month, release_year = parse_single_date(attributes.get('publicationYear')) + if not any((release_date, release_month, release_year)): + release_date, release_month, release_year = parse_single_date(attributes.get('published')) + + if not any((release_date, release_month, release_year)): + print('[{}] skipping record w/o date: {}'.format(doi, obj), file=sys.stderr) + return False + # Start with clear stages, e.g. published. TODO(martin): we could # probably infer a bit more from the relations, e.g. # "IsPreviousVersionOf" or "IsNewVersionOf". @@ -490,7 +501,7 @@ class DataciteImporter(EntityImporter): if len(text) > MAX_ABSTRACT_LENGTH: text = text[:MAX_ABSTRACT_LENGTH] + " [...]" - # Detect language. + # Detect language. This is fuzzy and may be removed, if too unreliable. lang = None try: lang = langdetect.detect(text) @@ -719,8 +730,10 @@ class DataciteImporter(EntityImporter): if name: name = clean(name) - if not name: + if not any((name, given_name, surname)): continue + if not name: + name = "{} {}".format(given_name, surname).strip() if name in name_blacklist: continue if name.lower() in UNKNOWN_MARKERS_LOWER: @@ -924,6 +937,32 @@ def parse_datacite_titles(titles): return title, original_language_title, subtitle +def parse_single_date(value): + """ + Given a single string containing a date in arbitrary format, try to return + tuple (date: datetime.date, month: int, year: int). + """ + if not value: + return None, None, None + if isinstance(value, int): + value = str(value) + parser = dateparser.DateDataParser() + try: + # Results in a dict with keys: date_obj, period, locale. + parse_result = parser.get_date_data(value) + # A datetime object, later we need a date, only. 
+ result = parse_result['date_obj'] + if result is not None: + if parse_result['period'] == 'year': + return None, None, result.year + elif parse_result['period'] == 'month': + return None, result.month, result.year + else: + return result.date(), result.month, result.year + except TypeError as err: + print("{} date parsing failed with: {}".format(value, err), file=sys.stderr) + + return None, None, None def parse_datacite_dates(dates): """ @@ -981,23 +1020,7 @@ def parse_datacite_dates(dates): if result is None: print('fallback for {}'.format(value), file=sys.stderr) - parser = dateparser.DateDataParser() - try: - # Results in a dict with keys: date_obj, period, locale. - parse_result = parser.get_date_data(value) - - # A datetime object, later we need a date, only. - result = parse_result['date_obj'] - if result is not None: - if parse_result['period'] == 'year': - return None, None, result.year - elif parse_result['period'] == 'month': - return None, result.month, result.year - else: - return result.date(), result.month, result.year - except TypeError as err: - print("{} date parsing failed with: {}".format(value, err), - file=sys.stderr) + release_date, release_month, release_year = parse_single_date(value) if result is None: # Unparsable date. 
diff --git a/python/tests/files/datacite/datacite_doc_30.json b/python/tests/files/datacite/datacite_doc_30.json new file mode 100644 index 00000000..5f851bbb --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_30.json @@ -0,0 +1,72 @@ +{ + "id": "10.17912/micropub.biology.000143", + "type": "dois", + "attributes": { + "doi": "10.17912/micropub.biology.000143", + "identifiers": null, + "creators": [ + { + "raw_name": "Celja J Uebel", + "givenName": "Celja J", + "familyName": "Uebel", + "affiliation": [], + "role": "author" + }, + { + "raw_name": "Carolyn M Phillips", + "givenName": "Carolyn M", + "familyName": "Phillips", + "affiliation": [], + "role": "author" + } + ], + "titles": [ + { + "title": "Phase-separated protein dynamics are affected by fluorescent tag choice" + } + ], + "publisher": "microPublication Biology", + "container": {}, + "publicationYear": 2019, + "subjects": [], + "contributors": [], + "dates": null, + "language": null, + "types": { + "resourceTypeGeneral": "DataPaper" + }, + "relatedIdentifiers": [], + "sizes": [], + "formats": [], + "version": null, + "rightsList": [], + "descriptions": [ + { + "description": "Biological liquid-liquid phase separation", + "descriptionType": "Abstract" + } + ], + "geoLocations": [], + "fundingReferences": [], + "url": "https://www.micropublication.org/journals/biology/micropub.biology.000143", + "contentUrl": null, + "metadataVersion": 0, + "schemaVersion": null, + "source": null, + "isActive": true, + "state": "findable", + "reason": null, + "created": "2019-08-19T14:43:08.000Z", + "registered": "2019-08-19T14:43:09.000Z", + "published": "2019", + "updated": "2019-11-09T12:32:02.000Z" + }, + "relationships": { + "client": { + "data": { + "id": "caltech.micropub", + "type": "clients" + } + } + } +} diff --git a/python/tests/files/datacite/datacite_result_30.json b/python/tests/files/datacite/datacite_result_30.json new file mode 100644 index 00000000..f7d1bb2c --- /dev/null +++ 
b/python/tests/files/datacite/datacite_result_30.json @@ -0,0 +1,38 @@ +{ + "abstracts": [ + { + "content": "Biological liquid-liquid phase separation", + "lang": "fr", + "mimetype": "text/plain" + } + ], + "contribs": [ + { + "index": 0, + "given_name": "Celja J", + "surname": "Uebel", + "raw_name": "Celja J Uebel", + "role": "author" + }, + { + "index": 1, + "given_name": "Carolyn M", + "raw_name": "Carolyn M Phillips", + "surname": "Phillips", + "role": "author" + } + ], + "ext_ids": { + "doi": "10.17912/micropub.biology.000143" + }, + "extra": { + "datacite": { + "resourceTypeGeneral": "DataPaper" + } + }, + "refs": [], + "release_stage": "published", + "release_year": 2019, + "publisher": "microPublication Biology", + "title": "Phase-separated protein dynamics are affected by fluorescent tag choice" +} diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index 669a6984..15650375 100644 --- a/python/tests/import_datacite.py +++ b/python/tests/import_datacite.py @@ -287,10 +287,9 @@ def test_datacite_conversions(datacite_importer): for now. """ datacite_importer.debug = True - for i in range(30): + for i in range(31): src = 'tests/files/datacite/datacite_doc_{0:02d}.json'.format(i) dst = 'tests/files/datacite/datacite_result_{0:02d}.json'.format(i) - print('testing mapping from {} => {}'.format(src, dst)) with open(src, 'r') as f: re = datacite_importer.parse_record(json.load(f)) result = entity_to_dict(re) -- cgit v1.2.3