From fcc6f24a95a7b77bda4ec813daecc2b737a82412 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Tue, 7 Jul 2020 02:08:26 +0200 Subject: datacite: address duplicated contributor issue Use string comparison. * https://fatcat.wiki/release/spjysmrnsrgyzgq6ise5o44rlu/contribs * https://api.datacite.org/dois/10.25940/roper-31098406 --- python/fatcat_tools/importers/datacite.py | 16 ++++++ python/tests/files/datacite/datacite_doc_33.json | 62 ++++++++++++++++++++++ .../tests/files/datacite/datacite_result_05.json | 3 -- .../tests/files/datacite/datacite_result_08.json | 7 --- .../tests/files/datacite/datacite_result_33.json | 31 +++++++++++ python/tests/import_datacite.py | 2 +- 6 files changed, 110 insertions(+), 11 deletions(-) create mode 100644 python/tests/files/datacite/datacite_doc_33.json create mode 100644 python/tests/files/datacite/datacite_result_33.json diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 434a2941..66ec2023 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -298,6 +298,9 @@ class DataciteImporter(EntityImporter): contribs = self.parse_datacite_creators(creators, doi=doi) + self.parse_datacite_creators(contributors, role=None, set_index=False, doi=doi) + # Address duplicated author names; use raw_name string comparison; refs #59. + contribs = unique_contributors(contribs) + # Title, may come with "attributes.titles[].titleType", like # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle" titles = attributes.get('titles', []) or [] @@ -823,6 +826,19 @@ class DataciteImporter(EntityImporter): return contribs +def unique_contributors(contribs): + """ + Given a list of ReleaseContrib items, return a list of unique + ReleaseContribs, refs GH #59. + """ + unique_names, unique_contribs = set(), [] + for rc in contribs: + if rc.raw_name and rc.raw_name in unique_names: + continue + unique_names.add(rc.raw_name) + unique_contribs.append(rc) + return unique_contribs + def lookup_license_slug(raw): """ Resolve a variety of strings into a some pseudo-canonical form, e.g. diff --git a/python/tests/files/datacite/datacite_doc_33.json b/python/tests/files/datacite/datacite_doc_33.json new file mode 100644 index 00000000..571d1220 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_33.json @@ -0,0 +1,62 @@ +{ + "id": "10.17912/micropub.biology.000143", + "type": "dois", + "attributes": { + "doi": "10.17912/micropub.biology.000143", + "identifiers": null, + "creators": [ + { + "name": "ABC News", + "givenName": "", + "familyName": "", + "affiliation": [], + "role": "author" + } + ], + "titles": [ + { + "title": "Sample" + } + ], + "publisher": "microPublication Biology", + "publicationYear": 2019, + "types": { + "resourceTypeGeneral": "DataPaper" + }, + "relatedIdentifiers": [], + "sizes": [], + "formats": [], + "version": null, + "rightsList": [], + "descriptions": [ + { + "description": 1234567890, + "descriptionType": "Abstract" + } + ], + "geoLocations": [], + "fundingReferences": [], + "url": "https://www.micropublication.org/journals/biology/micropub.biology.000143", + "created": "2019-08-19T14:43:08.000Z", + "registered": "2019-08-19T14:43:09.000Z", + "published": "2019", + "updated": "2019-11-09T12:32:02.000Z", + "contributors": [ + { + "name": "ABC News", + "givenName": "", + "familyName": "", + "affiliation": [], + "role": "" + } + ] + }, + "relationships": { + "client": { + "data": { + "id": "caltech.micropub", + "type": "clients" + } + } + } +} diff --git a/python/tests/files/datacite/datacite_result_05.json b/python/tests/files/datacite/datacite_result_05.json index 79c2a8fb..d634490d 100644 --- a/python/tests/files/datacite/datacite_result_05.json +++ b/python/tests/files/datacite/datacite_result_05.json @@ -504,9 +504,6 @@ "role": "author", "surname": "Wurzbacher" }, - { - "raw_name": "Kessy Abarenkov" - }, { "raw_name": "NHM UT-University Of Tartu; Natural History Museum And Botanic Garden" } diff --git a/python/tests/files/datacite/datacite_result_08.json b/python/tests/files/datacite/datacite_result_08.json index 70237280..5a46ef50 100644 --- a/python/tests/files/datacite/datacite_result_08.json +++ b/python/tests/files/datacite/datacite_result_08.json @@ -13,13 +13,6 @@ "raw_name": "Kei Kajisa", "role": "author", "surname": "Kajisa" - }, - { - "given_name": "Kei", - "index": 1, - "raw_name": "Kei Kajisa", - "role": "author", - "surname": "Kajisa" } ], "ext_ids": { diff --git a/python/tests/files/datacite/datacite_result_33.json b/python/tests/files/datacite/datacite_result_33.json new file mode 100644 index 00000000..bcb72469 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_33.json @@ -0,0 +1,31 @@ +{ + "abstracts": [ + { + "content": "1234567890", + "mimetype": "text/plain" + } + ], + "contribs": [ + { + "given_name": "", + "surname": "", + "index": 0, + "raw_name": "ABC News", + "role": "author" + } + ], + "ext_ids": { + "doi": "10.17912/micropub.biology.000143" + }, + "extra": { + "datacite": { + "resourceTypeGeneral": "DataPaper" + }, + "container_name": "microPublication Biology" + }, + "refs": [], + "release_stage": "published", + "release_year": 2019, + "publisher": "microPublication Biology", + "title": "Sample" +} diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index 20c1eaf8..1472b8ea 100644 --- a/python/tests/import_datacite.py +++ b/python/tests/import_datacite.py @@ -288,7 +288,7 @@ def test_datacite_conversions(datacite_importer): for now. """ datacite_importer.debug = True - for i in range(33): + for i in range(34): src = 'tests/files/datacite/datacite_doc_{0:02d}.json'.format(i) dst = 'tests/files/datacite/datacite_result_{0:02d}.json'.format(i) with open(src, 'r') as f: -- cgit v1.2.3