diff options
Diffstat (limited to 'python')
-rw-r--r-- | python/fatcat_tools/importers/datacite.py | 73 | ||||
-rw-r--r-- | python/tests/files/datacite/datacite_doc_30.json | 72 | ||||
-rw-r--r-- | python/tests/files/datacite/datacite_result_30.json | 39 | ||||
-rw-r--r-- | python/tests/import_datacite.py | 3 |
4 files changed, 162 insertions, 25 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 2f77481a..20fc399c 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -1,11 +1,11 @@ """ Prototype importer for datacite.org data. -Example input document at: https://gist.github.com/miku/5610a2d64e3fee82d16f5d3f3a295fc8. +Example input document: https://api.datacite.org/dois/10.7916/d8-f93n-rk51 -Datacite being an aggregator, the data is varied and exposes a couple of -problems in content and structure. A few fields habe their own parsing -functions (parse_datacite_...), which can be tested more easily. +Datacite being an aggregator, the data is heterogenous and exposes a couple of +problems in content and structure. A few fields have their own parsing +functions (parse_datacite_...), which may help testing. """ import collections @@ -311,6 +311,16 @@ class DataciteImporter(EntityImporter): release_date, release_month, release_year = parse_datacite_dates( attributes.get('dates', [])) + # Some records do not use the "dates" field (e.g. micropub), but: + # "attributes.published" or "attributes.publicationYear" + if not any((release_date, release_month, release_year)): + release_date, release_month, release_year = parse_single_date(attributes.get('publicationYear')) + if not any((release_date, release_month, release_year)): + release_date, release_month, release_year = parse_single_date(attributes.get('published')) + + if not any((release_date, release_month, release_year)): + print('[{}] record w/o date: {}'.format(doi, obj), file=sys.stderr) + # Start with clear stages, e.g. published. TODO(martin): we could # probably infer a bit more from the relations, e.g. # "IsPreviousVersionOf" or "IsNewVersionOf". @@ -380,6 +390,11 @@ class DataciteImporter(EntityImporter): len(container_name))) container_name = container_name[0] + # Exception: https://www.micropublication.org/, see: !MR24. + if container_id is None and container_name is None: + if publisher and publisher.lower().startswith('micropublication'): + container_name = publisher + # Volume and issue. volume = container.get('volume') issue = container.get('issue') @@ -490,7 +505,7 @@ class DataciteImporter(EntityImporter): if len(text) > MAX_ABSTRACT_LENGTH: text = text[:MAX_ABSTRACT_LENGTH] + " [...]" - # Detect language. + # Detect language. This is fuzzy and may be removed, if too unreliable. lang = None try: lang = langdetect.detect(text) @@ -719,8 +734,10 @@ class DataciteImporter(EntityImporter): if name: name = clean(name) - if not name: + if not any((name, given_name, surname)): continue + if not name: + name = "{} {}".format(given_name or '', surname or '').strip() if name in name_blacklist: continue if name.lower() in UNKNOWN_MARKERS_LOWER: @@ -924,6 +941,32 @@ def parse_datacite_titles(titles): return title, original_language_title, subtitle +def parse_single_date(value): + """ + Given a single string containing a date in arbitrary format, try to return + tuple (date: datetime.date, month: int, year: int). + """ + if not value: + return None, None, None + if isinstance(value, int): + value = str(value) + parser = dateparser.DateDataParser() + try: + # Results in a dict with keys: date_obj, period, locale. + parse_result = parser.get_date_data(value) + # A datetime object, later we need a date, only. + result = parse_result['date_obj'] + if result is not None: + if parse_result['period'] == 'year': + return None, None, result.year + elif parse_result['period'] == 'month': + return None, result.month, result.year + else: + return result.date(), result.month, result.year + except TypeError as err: + print("{} date parsing failed with: {}".format(value, err), file=sys.stderr) + + return None, None, None def parse_datacite_dates(dates): """ @@ -981,23 +1024,7 @@ def parse_datacite_dates(dates): if result is None: print('fallback for {}'.format(value), file=sys.stderr) - parser = dateparser.DateDataParser() - try: - # Results in a dict with keys: date_obj, period, locale. - parse_result = parser.get_date_data(value) - - # A datetime object, later we need a date, only. - result = parse_result['date_obj'] - if result is not None: - if parse_result['period'] == 'year': - return None, None, result.year - elif parse_result['period'] == 'month': - return None, result.month, result.year - else: - return result.date(), result.month, result.year - except TypeError as err: - print("{} date parsing failed with: {}".format(value, err), - file=sys.stderr) + release_date, release_month, release_year = parse_single_date(value) if result is None: # Unparsable date. diff --git a/python/tests/files/datacite/datacite_doc_30.json b/python/tests/files/datacite/datacite_doc_30.json new file mode 100644 index 00000000..5f851bbb --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_30.json @@ -0,0 +1,72 @@ +{ + "id": "10.17912/micropub.biology.000143", + "type": "dois", + "attributes": { + "doi": "10.17912/micropub.biology.000143", + "identifiers": null, + "creators": [ + { + "raw_name": "Celja J Uebel", + "givenName": "Celja J", + "familyName": "Uebel", + "affiliation": [], + "role": "author" + }, + { + "raw_name": "Carolyn M Phillips", + "givenName": "Carolyn M", + "familyName": "Phillips", + "affiliation": [], + "role": "author" + } + ], + "titles": [ + { + "title": "Phase-separated protein dynamics are affected by fluorescent tag choice" + } + ], + "publisher": "microPublication Biology", + "container": {}, + "publicationYear": 2019, + "subjects": [], + "contributors": [], + "dates": null, + "language": null, + "types": { + "resourceTypeGeneral": "DataPaper" + }, + "relatedIdentifiers": [], + "sizes": [], + "formats": [], + "version": null, + "rightsList": [], + "descriptions": [ + { + "description": "Biological liquid-liquid phase separation", + "descriptionType": "Abstract" + } + ], + "geoLocations": [], + "fundingReferences": [], + "url": "https://www.micropublication.org/journals/biology/micropub.biology.000143", + "contentUrl": null, + "metadataVersion": 0, + "schemaVersion": null, + "source": null, + "isActive": true, + "state": "findable", + "reason": null, + "created": "2019-08-19T14:43:08.000Z", + "registered": "2019-08-19T14:43:09.000Z", + "published": "2019", + "updated": "2019-11-09T12:32:02.000Z" + }, + "relationships": { + "client": { + "data": { + "id": "caltech.micropub", + "type": "clients" + } + } + } +} diff --git a/python/tests/files/datacite/datacite_result_30.json b/python/tests/files/datacite/datacite_result_30.json new file mode 100644 index 00000000..fc2c4dfc --- /dev/null +++ b/python/tests/files/datacite/datacite_result_30.json @@ -0,0 +1,39 @@ +{ + "abstracts": [ + { + "content": "Biological liquid-liquid phase separation", + "lang": "fr", + "mimetype": "text/plain" + } + ], + "contribs": [ + { + "index": 0, + "given_name": "Celja J", + "surname": "Uebel", + "raw_name": "Celja J Uebel", + "role": "author" + }, + { + "index": 1, + "given_name": "Carolyn M", + "raw_name": "Carolyn M Phillips", + "surname": "Phillips", + "role": "author" + } + ], + "ext_ids": { + "doi": "10.17912/micropub.biology.000143" + }, + "extra": { + "datacite": { + "resourceTypeGeneral": "DataPaper" + }, + "container_name": "microPublication Biology" + }, + "refs": [], + "release_stage": "published", + "release_year": 2019, + "publisher": "microPublication Biology", + "title": "Phase-separated protein dynamics are affected by fluorescent tag choice" +} diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index 669a6984..15650375 100644 --- a/python/tests/import_datacite.py +++ b/python/tests/import_datacite.py @@ -287,10 +287,9 @@ def test_datacite_conversions(datacite_importer): for now. """ datacite_importer.debug = True - for i in range(30): + for i in range(31): src = 'tests/files/datacite/datacite_doc_{0:02d}.json'.format(i) dst = 'tests/files/datacite/datacite_result_{0:02d}.json'.format(i) - print('testing mapping from {} => {}'.format(src, dst)) with open(src, 'r') as f: re = datacite_importer.parse_record(json.load(f)) result = entity_to_dict(re) |