From fcc6f24a95a7b77bda4ec813daecc2b737a82412 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Tue, 7 Jul 2020 02:08:26 +0200 Subject: datacite: address duplicated contributor issue Use string comparison. * https://fatcat.wiki/release/spjysmrnsrgyzgq6ise5o44rlu/contribs * https://api.datacite.org/dois/10.25940/roper-31098406 --- python/fatcat_tools/importers/datacite.py | 16 ++++++ python/tests/files/datacite/datacite_doc_33.json | 62 ++++++++++++++++++++++ .../tests/files/datacite/datacite_result_05.json | 3 -- .../tests/files/datacite/datacite_result_08.json | 7 --- .../tests/files/datacite/datacite_result_33.json | 31 +++++++++++ python/tests/import_datacite.py | 2 +- 6 files changed, 110 insertions(+), 11 deletions(-) create mode 100644 python/tests/files/datacite/datacite_doc_33.json create mode 100644 python/tests/files/datacite/datacite_result_33.json diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 434a2941..66ec2023 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -298,6 +298,9 @@ class DataciteImporter(EntityImporter): contribs = self.parse_datacite_creators(creators, doi=doi) + self.parse_datacite_creators(contributors, role=None, set_index=False, doi=doi) + # Address duplicated author names; use raw_name string comparison; refs #59. + contribs = unique_contributors(contribs) + # Title, may come with "attributes.titles[].titleType", like # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle" titles = attributes.get('titles', []) or [] @@ -823,6 +826,19 @@ class DataciteImporter(EntityImporter): return contribs +def unique_contributors(contribs): + """ + Given a list of ReleaseContrib items, return a list of unique + ReleaseContribs, refs GH #59. + """ + unique_names, unique_contribs = set(), [] + for rc in contribs: + if rc.raw_name and rc.raw_name in unique_names: + continue + unique_names.add(rc.raw_name) + unique_contribs.append(rc) + return unique_contribs + def lookup_license_slug(raw): """ Resolve a variety of strings into a some pseudo-canonical form, e.g. 
diff --git a/python/tests/files/datacite/datacite_doc_33.json b/python/tests/files/datacite/datacite_doc_33.json new file mode 100644 index 00000000..571d1220 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_33.json @@ -0,0 +1,62 @@ +{ + "id": "10.17912/micropub.biology.000143", + "type": "dois", + "attributes": { + "doi": "10.17912/micropub.biology.000143", + "identifiers": null, + "creators": [ + { + "name": "ABC News", + "givenName": "", + "familyName": "", + "affiliation": [], + "role": "author" + } + ], + "titles": [ + { + "title": "Sample" + } + ], + "publisher": "microPublication Biology", + "publicationYear": 2019, + "types": { + "resourceTypeGeneral": "DataPaper" + }, + "relatedIdentifiers": [], + "sizes": [], + "formats": [], + "version": null, + "rightsList": [], + "descriptions": [ + { + "description": 1234567890, + "descriptionType": "Abstract" + } + ], + "geoLocations": [], + "fundingReferences": [], + "url": "https://www.micropublication.org/journals/biology/micropub.biology.000143", + "created": "2019-08-19T14:43:08.000Z", + "registered": "2019-08-19T14:43:09.000Z", + "published": "2019", + "updated": "2019-11-09T12:32:02.000Z", + "contributors": [ + { + "name": "ABC News", + "givenName": "", + "familyName": "", + "affiliation": [], + "role": "" + } + ] + }, + "relationships": { + "client": { + "data": { + "id": "caltech.micropub", + "type": "clients" + } + } + } +} diff --git a/python/tests/files/datacite/datacite_result_05.json b/python/tests/files/datacite/datacite_result_05.json index 79c2a8fb..d634490d 100644 --- a/python/tests/files/datacite/datacite_result_05.json +++ b/python/tests/files/datacite/datacite_result_05.json @@ -504,9 +504,6 @@ "role": "author", "surname": "Wurzbacher" }, - { - "raw_name": "Kessy Abarenkov" - }, { "raw_name": "NHM UT-University Of Tartu; Natural History Museum And Botanic Garden" } diff --git a/python/tests/files/datacite/datacite_result_08.json b/python/tests/files/datacite/datacite_result_08.json index 70237280..5a46ef50 100644 --- a/python/tests/files/datacite/datacite_result_08.json +++ b/python/tests/files/datacite/datacite_result_08.json @@ -13,13 +13,6 @@ "raw_name": "Kei Kajisa", "role": "author", "surname": "Kajisa" - }, - { - "given_name": "Kei", - "index": 1, - "raw_name": "Kei Kajisa", - "role": "author", - "surname": "Kajisa" } ], "ext_ids": { diff --git a/python/tests/files/datacite/datacite_result_33.json b/python/tests/files/datacite/datacite_result_33.json new file mode 100644 index 00000000..bcb72469 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_33.json @@ -0,0 +1,31 @@ +{ + "abstracts": [ + { + "content": "1234567890", + "mimetype": "text/plain" + } + ], + "contribs": [ + { + "given_name": "", + "surname": "", + "index": 0, + "raw_name": "ABC News", + "role": "author" + } + ], + "ext_ids": { + "doi": "10.17912/micropub.biology.000143" + }, + "extra": { + "datacite": { + "resourceTypeGeneral": "DataPaper" + }, + "container_name": "microPublication Biology" + }, + "refs": [], + "release_stage": "published", + "release_year": 2019, + "publisher": "microPublication Biology", + "title": "Sample" +} diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index 20c1eaf8..1472b8ea 100644 --- a/python/tests/import_datacite.py +++ b/python/tests/import_datacite.py @@ -288,7 +288,7 @@ def test_datacite_conversions(datacite_importer): for now. 
""" datacite_importer.debug = True - for i in range(33): + for i in range(34): src = 'tests/files/datacite/datacite_doc_{0:02d}.json'.format(i) dst = 'tests/files/datacite/datacite_result_{0:02d}.json'.format(i) with open(src, 'r') as f: -- cgit v1.2.3 From 40f77b78aa331ca67b510dfece77e6a6000f8c2f Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 10 Jul 2020 00:50:34 +0200 Subject: wip: contrib, GH59 --- python/tests/files/datacite/datacite_doc_34.json | 61 ++++++++++++++++++++++ .../tests/files/datacite/datacite_result_05.json | 3 +- .../tests/files/datacite/datacite_result_09.json | 3 +- .../tests/files/datacite/datacite_result_26.json | 3 +- .../tests/files/datacite/datacite_result_34.json | 38 ++++++++++++++ 5 files changed, 105 insertions(+), 3 deletions(-) create mode 100644 python/tests/files/datacite/datacite_doc_34.json create mode 100644 python/tests/files/datacite/datacite_result_34.json diff --git a/python/tests/files/datacite/datacite_doc_34.json b/python/tests/files/datacite/datacite_doc_34.json new file mode 100644 index 00000000..5dcf65f4 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_34.json @@ -0,0 +1,61 @@ +{ + "id": "10.17912/micropub.biology.000143", + "type": "dois", + "attributes": { + "doi": "10.17912/micropub.biology.000143", + "identifiers": null, + "creators": [ + { + "name": "Paul Katz", + "givenName": "", + "familyName": "", + "affiliation": [], + "role": "author" + } + ], + "titles": [ + { + "title": "Sample" + } + ], + "publisher": "microPublication Biology", + "publicationYear": 2019, + "types": { + "resourceTypeGeneral": "DataPaper" + }, + "relatedIdentifiers": [], + "sizes": [], + "formats": [], + "version": null, + "rightsList": [], + "descriptions": [ + { + "description": 1234567890, + "descriptionType": "Abstract" + } + ], + "geoLocations": [], + "fundingReferences": [], + "url": "https://www.micropublication.org/journals/biology/micropub.biology.000143", + "created": "2019-08-19T14:43:08.000Z", + "registered": "2019-08-19T14:43:09.000Z", + "published": "2019", + "updated": "2019-11-09T12:32:02.000Z", + "contributors": [ + { + "name": "Paul Katz", + "givenName": "", + "familyName": "", + "affiliation": [], + "role": "illustrator" + } ] + }, + "relationships": { + "client": { + "data": { + "id": "caltech.micropub", + "type": "clients" + } + } + } +} diff --git a/python/tests/files/datacite/datacite_result_05.json b/python/tests/files/datacite/datacite_result_05.json index d634490d..c91f3a7f 100644 --- a/python/tests/files/datacite/datacite_result_05.json +++ b/python/tests/files/datacite/datacite_result_05.json @@ -505,7 +505,8 @@ "surname": "Wurzbacher" }, { - "raw_name": "NHM UT-University Of Tartu; Natural History Museum And Botanic Garden" + "raw_name": "NHM UT-University Of Tartu; Natural History Museum And Botanic Garden", + "role": "author" } ], "ext_ids": { diff --git a/python/tests/files/datacite/datacite_result_09.json b/python/tests/files/datacite/datacite_result_09.json index 09e02fc7..f6ec524a 100644 --- a/python/tests/files/datacite/datacite_result_09.json +++ b/python/tests/files/datacite/datacite_result_09.json @@ -17,7 +17,8 @@ "extra": { "type": "DataManager" }, - "raw_name": "Technische Informationsbibliothek (TIB)" + "raw_name": "Technische Informationsbibliothek (TIB)", + "role": "author" } ], "ext_ids": { diff --git a/python/tests/files/datacite/datacite_result_26.json b/python/tests/files/datacite/datacite_result_26.json index 267eb9c2..f6e589ef 100644 --- a/python/tests/files/datacite/datacite_result_26.json 
+++ b/python/tests/files/datacite/datacite_result_26.json @@ -13,7 +13,8 @@ }, "given_name": "David", "raw_name": "David Wemmer", - "surname": "Wemmer" + "surname": "Wemmer", + "role": "author" } ], "ext_ids": { diff --git a/python/tests/files/datacite/datacite_result_34.json b/python/tests/files/datacite/datacite_result_34.json new file mode 100644 index 00000000..8e087ab5 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_34.json @@ -0,0 +1,38 @@ +{ + "abstracts": [ + { + "content": "1234567890", + "mimetype": "text/plain" + } + ], + "contribs": [ + { + "given_name": "", + "surname": "", + "index": 0, + "raw_name": "Paul Katz", + "role": "author" + }, + { + "given_name": "", + "surname": "", + "index": 0, + "raw_name": "Paul Katz", + "role": "illustrator" + } + ], + "ext_ids": { + "doi": "10.17912/micropub.biology.000143" + }, + "extra": { + "datacite": { + "resourceTypeGeneral": "DataPaper" + }, + "container_name": "microPublication Biology" + }, + "refs": [], + "release_stage": "published", + "release_year": 2019, + "publisher": "microPublication Biology", + "title": "Sample" +} -- cgit v1.2.3 From df8dcde8d5eaf530e35f1467951271bff7475e64 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 10 Jul 2020 00:50:42 +0200 Subject: wip: contrib, GH59 --- python/fatcat_tools/importers/datacite.py | 38 +- python/tests/import_datacite.py | 590 ++++++++++++++++++------------ 2 files changed, 383 insertions(+), 245 deletions(-) diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 66ec2023..7797812f 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -292,14 +292,17 @@ class DataciteImporter(EntityImporter): print('[{}] skipping non-ascii doi for now'.format(doi)) return None - creators = attributes.get('creators', []) or [] contributors = attributes.get('contributors', []) or [] # Much fewer than creators. - contribs = self.parse_datacite_creators(creators, doi=doi) + self.parse_datacite_creators(contributors, role=None, set_index=False, doi=doi) + contribs = self.parse_datacite_creators(creators, doi=doi) + contribs_extra_contributors = self.parse_datacite_creators(contributors, set_index=False, doi=doi) - # Address duplicated author names; use raw_name string comparison; refs #59. - contribs = unique_contributors(contribs) + # Unfortunately, creators and contributors might overlap, refs GH59. 
+ for cc in contribs_extra_contributors: + if contributor_list_contains_contributor(contribs, cc): + continue + contribs.append(cc) # Title, may come with "attributes.titles[].titleType", like # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle" @@ -800,8 +803,7 @@ class DataciteImporter(EntityImporter): if contributorType: extra = {'type': contributorType} - contribs.append( - fatcat_openapi_client.ReleaseContrib( + rc = fatcat_openapi_client.ReleaseContrib( creator_id=creator_id, index=i, raw_name=name, @@ -810,7 +812,9 @@ class DataciteImporter(EntityImporter): role=role, raw_affiliation=raw_affiliation, extra=extra, - )) + ) + if not contributor_list_contains_contributor(contribs, rc): + contribs.append(rc) elif nameType == 'Organizational': name = c.get('name', '') or '' if name in UNKNOWN_MARKERS: @@ -826,18 +830,20 @@ class DataciteImporter(EntityImporter): return contribs -def unique_contributors(contribs): +def contributor_list_contains_contributor(contributor_list, contributor): """ - Given a list of ReleaseContrib items, return a list of unique - ReleaseContribs, refs GH #59. + Given a list of contributors, determine, whether contrib is in that list. """ - unique_names, unique_contribs = set(), [] - for rc in contribs: - if rc.raw_name and rc.raw_name in unique_names: + for cc in contributor_list: + if cc.raw_name != contributor.raw_name: + continue + cc_role = cc.role or 'author' + contributor_role = contributor.role or 'author' + if cc_role != contributor_role: continue - unique_names.add(rc.raw_name) - unique_contribs.append(rc) - return unique_contribs + return True + return False + def lookup_license_slug(raw): """ diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index 1472b8ea..b01a11e6 100644 --- a/python/tests/import_datacite.py +++ b/python/tests/import_datacite.py @@ -7,33 +7,54 @@ import datetime import pytest import gzip from fatcat_tools.importers import DataciteImporter, JsonLinePusher -from fatcat_tools.importers.datacite import find_original_language_title, parse_datacite_titles, parse_datacite_dates, clean_doi, index_form_to_display_name, lookup_license_slug +from fatcat_tools.importers.datacite import ( + find_original_language_title, + parse_datacite_titles, + parse_datacite_dates, + clean_doi, + index_form_to_display_name, + lookup_license_slug, + contributor_list_contains_contributor, +) from fatcat_tools.transforms import entity_to_dict +import fatcat_openapi_client from fixtures import api import json @pytest.fixture(scope="function") def datacite_importer(api): - with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: - yield DataciteImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', - bezerk_mode=True) + with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file: + yield DataciteImporter( + api, + issn_file, + extid_map_file="tests/files/example_map.sqlite3", + bezerk_mode=True, + ) + @pytest.fixture(scope="function") def datacite_importer_existing(api): - with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: - yield DataciteImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', - bezerk_mode=False) + with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file: + yield DataciteImporter( + api, + issn_file, + extid_map_file="tests/files/example_map.sqlite3", + bezerk_mode=False, + ) + @pytest.mark.skip(reason="larger datacite import slows tests down") def test_datacite_importer_huge(datacite_importer): last_index = 
datacite_importer.api.get_changelog(limit=1)[0].index - with gzip.open('tests/files/datacite_1k_records.jsonl.gz', 'rt') as f: + with gzip.open("tests/files/datacite_1k_records.jsonl.gz", "rt") as f: datacite_importer.bezerk_mode = True counts = JsonLinePusher(datacite_importer, f).run() - assert counts['insert'] == 998 - change = datacite_importer.api.get_changelog_entry(index=last_index+1) - release = datacite_importer.api.get_release(change.editgroup.edits.releases[0].ident) + assert counts["insert"] == 998 + change = datacite_importer.api.get_changelog_entry(index=last_index + 1) + release = datacite_importer.api.get_release( + change.editgroup.edits.releases[0].ident + ) assert len(release.contribs) == 3 @@ -41,122 +62,161 @@ def test_find_original_language_title(): """ Original language might be included, in various ways. """ - Case = collections.namedtuple('Case', 'about input result') + Case = collections.namedtuple("Case", "about input result") cases = [ - Case('defaults to None', {}, None), - Case('ignore unknown keys', {'broken': 'kv'}, None), - Case('just a title', {'title': 'Noise Reduction'}, None), - Case('same title should be ignored', { - 'title': 'Noise Reduction', - 'original_language_title': 'Noise Reduction' - }, None), - Case('empty subdict is ignored', { - 'title': 'Noise Reduction', - 'original_language_title': {}, - }, None), - Case('unknown subdict keys are ignored', { - 'title': 'Noise Reduction', - 'original_language_title': {'broken': 'kv'}, - }, None), - Case('original string', { - 'title': 'Noise Reduction', - 'original_language_title': 'Подавление шума', - }, 'Подавление шума'), - Case('language tag is ignored, since its broken', { - 'title': 'Noise Reduction', - 'original_language_title': { - 'language': 'ja', - '__content__': 'Noise Reduction' + Case("defaults to None", {}, None), + Case("ignore unknown keys", {"broken": "kv"}, None), + Case("just a title", {"title": "Noise Reduction"}, None), + Case( + "same title should be ignored", + {"title": "Noise Reduction", "original_language_title": "Noise Reduction"}, + None, + ), + Case( + "empty subdict is ignored", + {"title": "Noise Reduction", "original_language_title": {},}, + None, + ), + Case( + "unknown subdict keys are ignored", + {"title": "Noise Reduction", "original_language_title": {"broken": "kv"},}, + None, + ), + Case( + "original string", + {"title": "Noise Reduction", "original_language_title": "Подавление шума",}, + "Подавление шума", + ), + Case( + "language tag is ignored, since its broken", + { + "title": "Noise Reduction", + "original_language_title": { + "language": "ja", + "__content__": "Noise Reduction", + }, }, - }, None), - Case('do not care about language', { - 'title': 'Noise Reduction', - 'original_language_title': { - 'language': 'ja', - '__content__': 'Rauschunterdrückung', + None, + ), + Case( + "do not care about language", + { + "title": "Noise Reduction", + "original_language_title": { + "language": "ja", + "__content__": "Rauschunterdrückung", + }, }, - }, 'Rauschunterdrückung'), - Case('ignore excessive questionmarks', { - 'title': 'Noise Reduction', - 'original_language_title': { - 'language': 'ja', - '__content__': '???? However', + "Rauschunterdrückung", + ), + Case( + "ignore excessive questionmarks", + { + "title": "Noise Reduction", + "original_language_title": { + "language": "ja", + "__content__": "???? 
However", + }, }, - }, None), + None, + ), ] for case in cases: result = find_original_language_title(case.input) assert result == case.result + def test_parse_datacite_titles(): """ Given a list of titles, find title, original_language_title and subtitle. Result is a 3-tuple of title, original_language_title, subtitle. """ - Case = collections.namedtuple('Case', 'about input result') + Case = collections.namedtuple("Case", "about input result") cases = [ - Case('handle None', None, (None, None, None)), - Case('empty list', [], (None, None, None)), - Case('empty item', [{}], (None, None, None)), - Case('broken keys', [{'broken': 'kv'}], (None, None, None)), - Case('title only', [{'title': 'Total carbon dioxide'}], - ('Total carbon dioxide', None, None), - ), - Case('title and subtitle', [ - {'title': 'Total carbon dioxide'}, - {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, - ], - ('Total carbon dioxide', None, 'Station TT043_7-9'), - ), - Case('title, subtitle order does not matter', [ - {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, - {'title': 'Total carbon dioxide'}, - ], - ('Total carbon dioxide', None, 'Station TT043_7-9'), - ), - Case('multiple titles, first wins', [ - {'title': 'Total carbon dioxide'}, - {'title': 'Meeting Heterogeneity'}, - ], - ('Total carbon dioxide', None, None), - ), - Case('multiple titles, plus sub', [ - {'title': 'Total carbon dioxide'}, - {'title': 'Meeting Heterogeneity'}, - {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, - ], - ('Total carbon dioxide', None, 'Station TT043_7-9'), - ), - Case('multiple titles, multiple subs', [ - {'title': 'Total carbon dioxide'}, - {'title': 'Meeting Heterogeneity'}, - {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, - {'title': 'Some other subtitle', 'titleType': 'Subtitle'}, - ], - ('Total carbon dioxide', None, 'Station TT043_7-9'), - ), - Case('title, original, sub', [ - {'title': 'Total carbon dioxide', 'original_language_title': 'Всего углекислого газа'}, - {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, - ], - ('Total carbon dioxide', 'Всего углекислого газа', 'Station TT043_7-9'), - ), - Case('title, original same as title, sub', [ - {'title': 'Total carbon dioxide', 'original_language_title': { - '__content__': 'Total carbon dioxide', - }}, - {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, - ], - ('Total carbon dioxide', None, 'Station TT043_7-9'), - ), - Case('title, original dict, sub', [ - {'title': 'Total carbon dioxide', 'original_language_title': { - '__content__': 'Всего углекислого газа', - }}, - {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, - ], - ('Total carbon dioxide', 'Всего углекислого газа', 'Station TT043_7-9'), + Case("handle None", None, (None, None, None)), + Case("empty list", [], (None, None, None)), + Case("empty item", [{}], (None, None, None)), + Case("broken keys", [{"broken": "kv"}], (None, None, None)), + Case( + "title only", + [{"title": "Total carbon dioxide"}], + ("Total carbon dioxide", None, None), + ), + Case( + "title and subtitle", + [ + {"title": "Total carbon dioxide"}, + {"title": "Station TT043_7-9", "titleType": "Subtitle"}, + ], + ("Total carbon dioxide", None, "Station TT043_7-9"), + ), + Case( + "title, subtitle order does not matter", + [ + {"title": "Station TT043_7-9", "titleType": "Subtitle"}, + {"title": "Total carbon dioxide"}, + ], + ("Total carbon dioxide", None, "Station TT043_7-9"), + ), + Case( + "multiple titles, first wins", + [{"title": "Total carbon dioxide"}, {"title": "Meeting 
Heterogeneity"},], + ("Total carbon dioxide", None, None), + ), + Case( + "multiple titles, plus sub", + [ + {"title": "Total carbon dioxide"}, + {"title": "Meeting Heterogeneity"}, + {"title": "Station TT043_7-9", "titleType": "Subtitle"}, + ], + ("Total carbon dioxide", None, "Station TT043_7-9"), + ), + Case( + "multiple titles, multiple subs", + [ + {"title": "Total carbon dioxide"}, + {"title": "Meeting Heterogeneity"}, + {"title": "Station TT043_7-9", "titleType": "Subtitle"}, + {"title": "Some other subtitle", "titleType": "Subtitle"}, + ], + ("Total carbon dioxide", None, "Station TT043_7-9"), + ), + Case( + "title, original, sub", + [ + { + "title": "Total carbon dioxide", + "original_language_title": "Всего углекислого газа", + }, + {"title": "Station TT043_7-9", "titleType": "Subtitle"}, + ], + ("Total carbon dioxide", "Всего углекислого газа", "Station TT043_7-9"), + ), + Case( + "title, original same as title, sub", + [ + { + "title": "Total carbon dioxide", + "original_language_title": {"__content__": "Total carbon dioxide",}, + }, + {"title": "Station TT043_7-9", "titleType": "Subtitle"}, + ], + ("Total carbon dioxide", None, "Station TT043_7-9"), + ), + Case( + "title, original dict, sub", + [ + { + "title": "Total carbon dioxide", + "original_language_title": { + "__content__": "Всего углекислого газа", + }, + }, + {"title": "Station TT043_7-9", "titleType": "Subtitle"}, + ], + ("Total carbon dioxide", "Всего углекислого газа", "Station TT043_7-9"), ), ] @@ -164,91 +224,128 @@ def test_parse_datacite_titles(): result = parse_datacite_titles(case.input) assert result == case.result, case.about + def test_parse_datacite_dates(): """ Test datacite date parsing. """ - Case = collections.namedtuple('Case', 'about input result') + Case = collections.namedtuple("Case", "about input result") cases = [ - Case('None is None', None, (None, None, None)), - Case('empty list is None', [], (None, None, None)), - Case('empty item is None', [{}], (None, None, None)), - Case('year only yields year only', [{'date': '2019'}], (None, None, 2019)), - Case('int year', [{'date': 2019}], (None, None, 2019)), - Case('first wins', [{'date': '2019'}, {'date': '2020'}], (None, None, 2019)), - Case('skip bogus year', [{'date': 'abc'}, {'date': '2020'}], (None, None, 2020)), - Case('first with type', [ - {'date': '2019', 'dateType': 'Accepted'}, {'date': '2020'} - ], (None, None, 2019)), - Case('full date', [ - {'date': '2019-12-01', 'dateType': 'Valid'}, - ], (datetime.date(2019, 12, 1), 12, 2019)), - Case('date type prio', [ - {'date': '2000-12-01', 'dateType': 'Valid'}, - {'date': '2010-01-01', 'dateType': 'Updated'}, - ], (datetime.date(2000, 12, 1), 12, 2000)), - Case('date type prio, Available > Updated', [ - {'date': '2010-01-01', 'dateType': 'Updated'}, - {'date': '2000-12-01', 'dateType': 'Available'}, - ], (datetime.date(2000, 12, 1), 12, 2000)), - Case('allow different date formats, Available > Updated', [ - {'date': '2010-01-01T10:00:00', 'dateType': 'Updated'}, - {'date': '2000-12-01T10:00:00', 'dateType': 'Available'}, - ], (datetime.date(2000, 12, 1), 12, 2000)), - Case('allow different date formats, Available > Updated', [ - {'date': '2010-01-01T10:00:00Z', 'dateType': 'Updated'}, - {'date': '2000-12-01T10:00:00Z', 'dateType': 'Available'}, - ], (datetime.date(2000, 12, 1), 12, 2000)), - Case('allow fuzzy date formats, Available > Updated', [ - {'date': '2010', 'dateType': 'Updated'}, - {'date': '2000 Dec 01', 'dateType': 'Available'}, - ], (datetime.date(2000, 12, 1), 12, 2000)), - 
Case('fuzzy year only', [ - {'date': 'Year 2010', 'dateType': 'Issued'}, - ], (None, None, 2010)), - Case('fuzzy year and month', [ - {'date': 'Year 2010 Feb', 'dateType': 'Issued'}, - ], (None, 2, 2010)), - Case('fuzzy year, month, day', [ - {'date': 'Year 2010 Feb 24', 'dateType': 'Issued'}, - ], (datetime.date(2010, 2, 24), 2, 2010)), - Case('ignore broken date', [ - {'date': 'Febrrr 45', 'dateType': 'Updated'}, - ], (None, None, None)), + Case("None is None", None, (None, None, None)), + Case("empty list is None", [], (None, None, None)), + Case("empty item is None", [{}], (None, None, None)), + Case("year only yields year only", [{"date": "2019"}], (None, None, 2019)), + Case("int year", [{"date": 2019}], (None, None, 2019)), + Case("first wins", [{"date": "2019"}, {"date": "2020"}], (None, None, 2019)), + Case( + "skip bogus year", [{"date": "abc"}, {"date": "2020"}], (None, None, 2020) + ), + Case( + "first with type", + [{"date": "2019", "dateType": "Accepted"}, {"date": "2020"}], + (None, None, 2019), + ), + Case( + "full date", + [{"date": "2019-12-01", "dateType": "Valid"},], + (datetime.date(2019, 12, 1), 12, 2019), + ), + Case( + "date type prio", + [ + {"date": "2000-12-01", "dateType": "Valid"}, + {"date": "2010-01-01", "dateType": "Updated"}, + ], + (datetime.date(2000, 12, 1), 12, 2000), + ), + Case( + "date type prio, Available > Updated", + [ + {"date": "2010-01-01", "dateType": "Updated"}, + {"date": "2000-12-01", "dateType": "Available"}, + ], + (datetime.date(2000, 12, 1), 12, 2000), + ), + Case( + "allow different date formats, Available > Updated", + [ + {"date": "2010-01-01T10:00:00", "dateType": "Updated"}, + {"date": "2000-12-01T10:00:00", "dateType": "Available"}, + ], + (datetime.date(2000, 12, 1), 12, 2000), + ), + Case( + "allow different date formats, Available > Updated", + [ + {"date": "2010-01-01T10:00:00Z", "dateType": "Updated"}, + {"date": "2000-12-01T10:00:00Z", "dateType": "Available"}, + ], + (datetime.date(2000, 12, 1), 12, 2000), + ), + Case( + "allow fuzzy date formats, Available > Updated", + [ + {"date": "2010", "dateType": "Updated"}, + {"date": "2000 Dec 01", "dateType": "Available"}, + ], + (datetime.date(2000, 12, 1), 12, 2000), + ), + Case( + "fuzzy year only", + [{"date": "Year 2010", "dateType": "Issued"},], + (None, None, 2010), + ), + Case( + "fuzzy year and month", + [{"date": "Year 2010 Feb", "dateType": "Issued"},], + (None, 2, 2010), + ), + Case( + "fuzzy year, month, day", + [{"date": "Year 2010 Feb 24", "dateType": "Issued"},], + (datetime.date(2010, 2, 24), 2, 2010), + ), + Case( + "ignore broken date", + [{"date": "Febrrr 45", "dateType": "Updated"},], + (None, None, None), + ), ] for case in cases: result = parse_datacite_dates(case.input) assert result == case.result, case.about + def test_datacite_importer(datacite_importer): last_index = datacite_importer.api.get_changelog(limit=1)[0].index - with open('tests/files/datacite_sample.jsonl', 'r') as f: + with open("tests/files/datacite_sample.jsonl", "r") as f: datacite_importer.bezerk_mode = True counts = JsonLinePusher(datacite_importer, f).run() - assert counts['insert'] == 1 - assert counts['exists'] == 0 - assert counts['skip'] == 0 + assert counts["insert"] == 1 + assert counts["exists"] == 0 + assert counts["skip"] == 0 # fetch most recent editgroup - change = datacite_importer.api.get_changelog_entry(index=last_index+1) + change = datacite_importer.api.get_changelog_entry(index=last_index + 1) eg = change.editgroup assert eg.description assert "datacite" in 
eg.description.lower() - assert eg.extra['git_rev'] - assert "fatcat_tools.DataciteImporter" in eg.extra['agent'] + assert eg.extra["git_rev"] + assert "fatcat_tools.DataciteImporter" in eg.extra["agent"] last_index = datacite_importer.api.get_changelog(limit=1)[0].index - with open('tests/files/datacite_sample.jsonl', 'r') as f: + with open("tests/files/datacite_sample.jsonl", "r") as f: datacite_importer.bezerk_mode = False datacite_importer.reset() counts = JsonLinePusher(datacite_importer, f).run() - assert counts['insert'] == 0 - assert counts['exists'] == 1 - assert counts['skip'] == 0 + assert counts["insert"] == 0 + assert counts["exists"] == 1 + assert counts["skip"] == 0 assert last_index == datacite_importer.api.get_changelog(limit=1)[0].index + def test_datacite_dict_parse(datacite_importer): - with open('tests/files/datacite_sample.jsonl', 'r') as f: + with open("tests/files/datacite_sample.jsonl", "r") as f: raw = json.load(f) r = datacite_importer.parse_record(raw) # ensure the API server is ok with format @@ -256,7 +353,9 @@ def test_datacite_dict_parse(datacite_importer): print(r.extra) assert r.title == "Triticum turgidum L. subsp. durum (Desf.) Husn. 97090" - assert r.publisher == "International Centre for Agricultural Research in Dry Areas" + assert ( + r.publisher == "International Centre for Agricultural Research in Dry Areas" + ) assert r.release_type == "article" assert r.release_stage == "published" assert r.license_slug == None @@ -267,13 +366,15 @@ def test_datacite_dict_parse(datacite_importer): assert r.subtitle == None assert r.release_date == None assert r.release_year == 1986 - assert 'subtitle' not in r.extra - assert 'subtitle' not in r.extra['datacite'] - assert 'funder' not in r.extra - assert 'funder' not in r.extra['datacite'] + assert "subtitle" not in r.extra + assert "subtitle" not in r.extra["datacite"] + assert "funder" not in r.extra + assert "funder" not in r.extra["datacite"] # matched by ISSN, so shouldn't be in there - #assert extra['container_name'] == "International Journal of Quantum Chemistry" - assert r.extra['datacite']['subjects'] == [{'subject': 'Plant Genetic Resource for Food and Agriculture'}] + # assert extra['container_name'] == "International Journal of Quantum Chemistry" + assert r.extra["datacite"]["subjects"] == [ + {"subject": "Plant Genetic Resource for Food and Agriculture"} + ] assert len(r.abstracts) == 1 assert len(r.abstracts[0].content) == 421 assert len(r.contribs) == 2 @@ -282,34 +383,41 @@ def test_datacite_dict_parse(datacite_importer): assert r.contribs[0].surname == None assert len(r.refs) == 0 + def test_datacite_conversions(datacite_importer): """ Datacite JSON to release entity JSON representation. The count is hardcoded for now. 
""" datacite_importer.debug = True - for i in range(34): - src = 'tests/files/datacite/datacite_doc_{0:02d}.json'.format(i) - dst = 'tests/files/datacite/datacite_result_{0:02d}.json'.format(i) - with open(src, 'r') as f: + for i in range(35): + src = "tests/files/datacite/datacite_doc_{0:02d}.json".format(i) + dst = "tests/files/datacite/datacite_result_{0:02d}.json".format(i) + with open(src, "r") as f: re = datacite_importer.parse_record(json.load(f)) result = entity_to_dict(re) - with open(dst, 'r') as f: - expected = json.loads(f.read()) + with open(dst, "r") as f: + expected = json.loads(f.read()) + + assert result == expected, "output mismatch in {}".format(dst) - assert result == expected, 'output mismatch in {}'.format(dst) def test_index_form_to_display_name(): - Case = collections.namedtuple('Case', 'input output') + Case = collections.namedtuple("Case", "input output") cases = [ - Case('', ''), - Case('ABC', 'ABC'), - Case('International Space Station', 'International Space Station'), - Case('Jin, Shan', 'Shan Jin'), - Case('Volkshochschule Der Bundesstadt Bonn', 'Volkshochschule Der Bundesstadt Bonn'), - Case('Solomon, P. M.', 'P. M. Solomon'), - Case('Sujeevan Ratnasingham', 'Sujeevan Ratnasingham'), - Case('Paul Stöckli (1906-1991), Künstler', 'Paul Stöckli (1906-1991), Künstler'), + Case("", ""), + Case("ABC", "ABC"), + Case("International Space Station", "International Space Station"), + Case("Jin, Shan", "Shan Jin"), + Case( + "Volkshochschule Der Bundesstadt Bonn", + "Volkshochschule Der Bundesstadt Bonn", + ), + Case("Solomon, P. M.", "P. M. Solomon"), + Case("Sujeevan Ratnasingham", "Sujeevan Ratnasingham"), + Case( + "Paul Stöckli (1906-1991), Künstler", "Paul Stöckli (1906-1991), Künstler" + ), ] for c in cases: @@ -317,45 +425,69 @@ def test_index_form_to_display_name(): def test_lookup_license_slug(): - Case = collections.namedtuple('Case', 'input output') + Case = collections.namedtuple("Case", "input output") cases = [ - Case('https://opensource.org/licenses/MIT', 'MIT'), - Case('creativecommons.org/licenses/by-nc-nd/3.0/', 'CC-BY-NC-ND'), - Case('http://creativecommons.org/licences/by-nc-sa/4.0', 'CC-BY-NC-SA'), - Case('http://creativecommons.org/licenses/by-nc-nd/2.5/co', 'CC-BY-NC-ND'), - Case('http://creativecommons.org/licenses/by-nd/4.0/legalcode', 'CC-BY-ND'), - Case('http://creativecommons.org/licenses/by/2.0/uk/legalcode', 'CC-BY'), - Case('http://creativecommons.org/publicdomain/zero/1.0/legalcode', 'CC-0'), - Case('http://doi.wiley.com/10.1002/tdm_license_1.1', 'WILEY-TDM-1.1'), - Case('http://homepage.data-planet.com/terms-use', 'SAGE-DATA-PLANET'), - Case('http://www.springer.com/tdm', 'SPRINGER-TDM'), - Case('https://archaeologydataservice.ac.uk/advice/termsOfUseAndAccess.xhtml', 'ADS-UK'), - Case('https://archaeologydataservice.ac.uk/advice/termsOfUseAndAccess', 'ADS-UK'), - Case('https://creativecommons.org/public-domain/cc0', 'CC-0'), - Case('https://creativecommons.org/publicdomain/zero/1.0', 'CC-0'), - Case('https://creativecommons.org/share-your-work/public-domain/cc0', 'CC-0'), - Case('https://www.elsevier.com/tdm/userlicense/1.0', 'ELSEVIER-USER-1.0'), - Case('https://www.gnu.org/licenses/gpl-3.0.html', 'GPL-3.0'), - Case('http://rightsstatements.org/page/InC/1.0?language=en', 'RS-INC'), - Case('http://onlinelibrary.wiley.com/termsAndConditions', 'WILEY'), - Case('https://publikationen.bibliothek.kit.edu/kitopen-lizenz', 'KIT-OPEN'), - Case('http://journals.sagepub.com/page/policies/text-and-data-mining-license', 'SAGE-TDM'), - 
Case('https://creativecommons.org/publicdomain/mark/1.0/deed.de', 'CC-PUBLICDOMAIN'), - Case('http://creativecommons.org/publicdomain/mark/1.0', 'CC-PUBLICDOMAIN'), - Case('https://creativecommons.org/publicdomain/mark/1.0', 'CC-PUBLICDOMAIN'), - Case('https://creativecommons.org/publicdomain/mark/1.0/', 'CC-PUBLICDOMAIN'), - Case('https://creativecommons.org/publicdomain/mark/1.0/deed.de', 'CC-PUBLICDOMAIN'), - Case('https://creativecommons.org/share-your-work/public-domain/cc0/', 'CC-0'), - Case('http://spdx.org/licenses/CC0-1.0.json', 'CC-0'), - Case('http://spdx.org/licenses/CC-BY-1.0.json', 'CC-BY'), - Case('http://spdx.org/licenses/CC-BY-4.0.json', 'CC-BY'), - Case('http://spdx.org/licenses/CC-BY-NC-4.0.json', 'CC-BY-NC'), - Case('http://spdx.org/licenses/CC-BY-SA-3.0.json', 'CC-BY-SA'), - Case('http://spdx.org/licenses/CC-BY-SA-4.0.json', 'CC-BY-SA'), - Case('http://spdx.org/licenses/MIT.json', 'MIT'), - Case('http://spdx.org/licenses/OGL-Canada-2.0.json', 'OGL-CANADA'), + Case("https://opensource.org/licenses/MIT", "MIT"), + Case("creativecommons.org/licenses/by-nc-nd/3.0/", "CC-BY-NC-ND"), + Case("http://creativecommons.org/licences/by-nc-sa/4.0", "CC-BY-NC-SA"), + Case("http://creativecommons.org/licenses/by-nc-nd/2.5/co", "CC-BY-NC-ND"), + Case("http://creativecommons.org/licenses/by-nd/4.0/legalcode", "CC-BY-ND"), + Case("http://creativecommons.org/licenses/by/2.0/uk/legalcode", "CC-BY"), + Case("http://creativecommons.org/publicdomain/zero/1.0/legalcode", "CC-0"), + Case("http://doi.wiley.com/10.1002/tdm_license_1.1", "WILEY-TDM-1.1"), + Case("http://homepage.data-planet.com/terms-use", "SAGE-DATA-PLANET"), + Case("http://www.springer.com/tdm", "SPRINGER-TDM"), + Case( + "https://archaeologydataservice.ac.uk/advice/termsOfUseAndAccess.xhtml", + "ADS-UK", + ), + Case( + "https://archaeologydataservice.ac.uk/advice/termsOfUseAndAccess", "ADS-UK" + ), + Case("https://creativecommons.org/public-domain/cc0", "CC-0"), + Case("https://creativecommons.org/publicdomain/zero/1.0", "CC-0"), + Case("https://creativecommons.org/share-your-work/public-domain/cc0", "CC-0"), + Case("https://www.elsevier.com/tdm/userlicense/1.0", "ELSEVIER-USER-1.0"), + Case("https://www.gnu.org/licenses/gpl-3.0.html", "GPL-3.0"), + Case("http://rightsstatements.org/page/InC/1.0?language=en", "RS-INC"), + Case("http://onlinelibrary.wiley.com/termsAndConditions", "WILEY"), + Case("https://publikationen.bibliothek.kit.edu/kitopen-lizenz", "KIT-OPEN"), + Case( + "http://journals.sagepub.com/page/policies/text-and-data-mining-license", + "SAGE-TDM", + ), + Case( + "https://creativecommons.org/publicdomain/mark/1.0/deed.de", + "CC-PUBLICDOMAIN", + ), + Case("http://creativecommons.org/publicdomain/mark/1.0", "CC-PUBLICDOMAIN"), + Case("https://creativecommons.org/publicdomain/mark/1.0", "CC-PUBLICDOMAIN"), + Case("https://creativecommons.org/publicdomain/mark/1.0/", "CC-PUBLICDOMAIN"), + Case( + "https://creativecommons.org/publicdomain/mark/1.0/deed.de", + "CC-PUBLICDOMAIN", + ), + Case("https://creativecommons.org/share-your-work/public-domain/cc0/", "CC-0"), + Case("http://spdx.org/licenses/CC0-1.0.json", "CC-0"), + Case("http://spdx.org/licenses/CC-BY-1.0.json", "CC-BY"), + Case("http://spdx.org/licenses/CC-BY-4.0.json", "CC-BY"), + Case("http://spdx.org/licenses/CC-BY-NC-4.0.json", "CC-BY-NC"), + Case("http://spdx.org/licenses/CC-BY-SA-3.0.json", "CC-BY-SA"), + Case("http://spdx.org/licenses/CC-BY-SA-4.0.json", "CC-BY-SA"), + Case("http://spdx.org/licenses/MIT.json", "MIT"), + 
Case("http://spdx.org/licenses/OGL-Canada-2.0.json", "OGL-CANADA"), ] for c in cases: got = lookup_license_slug(c.input) - assert c.output == got, '{}: got {}, want {}'.format(c.input, got, c.output) + assert c.output == got, "{}: got {}, want {}".format(c.input, got, c.output) + + +def test_contributor_list_contains_contributor(): + Case = collections.namedtuple("Case", "contrib_list contrib want") + cases = [ + Case([], fatcat_openapi_client.ReleaseContrib(raw_name="Paul Katz"), False), + ] + for c in cases: + got = contributor_list_contains_contributor(c.contrib_list, c.contrib) + assert got == c.want -- cgit v1.2.3 From 2411bad315b48b99c19958ea3c393dc4d09d6486 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 10 Jul 2020 18:29:00 +0200 Subject: datacite: document contributor types --- python/fatcat_tools/importers/datacite.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 7797812f..797ccf19 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -296,6 +296,31 @@ class DataciteImporter(EntityImporter): contributors = attributes.get('contributors', []) or [] # Much fewer than creators. contribs = self.parse_datacite_creators(creators, doi=doi) + + # Beside creators, we have contributors in datacite. Sample: + # ContactPerson, DataCollector, DataCurator, DataManager, Distributor, + # Editor, Funder, HostingInstitution, Other, Producer, ProjectLeader, + # ProjectMember, RelatedPerson, ResearchGroup, Researcher, + # RightsHolder, Sponsor, Supervisor + # + # Datacite schema: + # https://schema.datacite.org/meta/kernel-4.3/doc/DataCite-MetadataKernel_v4.3.pdf#page=32 + # -- could be used as a form of controlled vocab? + # + # Currently (07/2020) in release_contrib: + # + # select count(*), role from release_contrib group by role; + # count | role + # -----------+------------ + # 500269665 | author + # 4386563 | editor + # 17871 | translator + # 10870584 | + # (4 rows) + # + # Related: https://guide.fatcat.wiki/entity_release.html -- role + # (string, of a set): the type of contribution, from a controlled + # vocabulary. TODO: vocabulary needs review. contribs_extra_contributors = self.parse_datacite_creators(contributors, set_index=False, doi=doi) # Unfortunately, creators and contributors might overlap, refs GH59. -- cgit v1.2.3 From d2bcd77f73c6496a2ffdd865d2348f33f4fb17f1 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 10 Jul 2020 18:29:31 +0200 Subject: datacite: there should be no index gaps --- python/fatcat_tools/importers/datacite.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 797ccf19..962d80c6 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -758,9 +758,10 @@ class DataciteImporter(EntityImporter): # Names, that should be ignored right away. name_blacklist = set(('Occdownload Gbif.Org',)) - for i, c in enumerate(creators): + i = 0 + for c in creators: if not set_index: - i = None + i = None nameType = c.get('nameType', '') or '' if nameType in ('', 'Personal'): creator_id = None @@ -838,8 +839,11 @@ class DataciteImporter(EntityImporter): raw_affiliation=raw_affiliation, extra=extra, ) + # Filter out duplicates early. 
if not contributor_list_contains_contributor(contribs, rc): contribs.append(rc) + if i is not None: + i += 1 elif nameType == 'Organizational': name = c.get('name', '') or '' if name in UNKNOWN_MARKERS: @@ -849,6 +853,8 @@ class DataciteImporter(EntityImporter): extra = {'organization': name} contribs.append(fatcat_openapi_client.ReleaseContrib( index=i, extra=extra)) + if i is not None: + i += 1 else: print('[{}] unknown name type: {}'.format(doi, nameType), file=sys.stderr) -- cgit v1.2.3 From fdf1028c19b0623e30b91e49ffa65ed130dcfdc1 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 10 Jul 2020 18:29:47 +0200 Subject: datacite: adjust tests --- python/tests/files/datacite/datacite_result_27.json | 3 ++- python/tests/files/datacite/datacite_result_28.json | 3 ++- python/tests/files/datacite/datacite_result_29.json | 3 ++- python/tests/files/datacite/datacite_result_34.json | 7 ------- 4 files changed, 6 insertions(+), 10 deletions(-) diff --git a/python/tests/files/datacite/datacite_result_27.json b/python/tests/files/datacite/datacite_result_27.json index 3d033e6a..e934fb41 100644 --- a/python/tests/files/datacite/datacite_result_27.json +++ b/python/tests/files/datacite/datacite_result_27.json @@ -13,7 +13,8 @@ }, "given_name": "David", "raw_name": "David Wemmer", - "surname": "Wemmer" + "surname": "Wemmer", + "role": "author" } ], "ext_ids": { diff --git a/python/tests/files/datacite/datacite_result_28.json b/python/tests/files/datacite/datacite_result_28.json index 84bed9c8..bcb1caaf 100644 --- a/python/tests/files/datacite/datacite_result_28.json +++ b/python/tests/files/datacite/datacite_result_28.json @@ -13,7 +13,8 @@ }, "given_name": "David", "raw_name": "David Wemmer", - "surname": "Wemmer" + "surname": "Wemmer", + "role": "author" } ], "ext_ids": { diff --git a/python/tests/files/datacite/datacite_result_29.json b/python/tests/files/datacite/datacite_result_29.json index 84bed9c8..bcb1caaf 100644 --- a/python/tests/files/datacite/datacite_result_29.json +++ b/python/tests/files/datacite/datacite_result_29.json @@ -13,7 +13,8 @@ }, "given_name": "David", "raw_name": "David Wemmer", - "surname": "Wemmer" + "surname": "Wemmer", + "role": "author" } ], "ext_ids": { diff --git a/python/tests/files/datacite/datacite_result_34.json b/python/tests/files/datacite/datacite_result_34.json index 8e087ab5..4a52e22c 100644 --- a/python/tests/files/datacite/datacite_result_34.json +++ b/python/tests/files/datacite/datacite_result_34.json @@ -12,13 +12,6 @@ "index": 0, "raw_name": "Paul Katz", "role": "author" - }, - { - "given_name": "", - "surname": "", - "index": 0, - "raw_name": "Paul Katz", - "role": "illustrator" } ], "ext_ids": { -- cgit v1.2.3
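
Note on the deduplication logic introduced in this series: contributor_list_contains_contributor treats two entries as duplicates only if both the raw_name and the effective role match, where a missing role defaults to "author". The sketch below is a minimal, self-contained illustration of that comparison; it stands in for fatcat_openapi_client.ReleaseContrib with a plain namedtuple carrying only the raw_name and role fields the check inspects, so the type here is a substitute for illustration, not the real client class.

import collections

# Stand-in for fatcat_openapi_client.ReleaseContrib: the duplicate check
# only ever looks at raw_name and role.
Contrib = collections.namedtuple("Contrib", "raw_name role")

def contributor_list_contains_contributor(contributor_list, contributor):
    # Mirrors the patched helper: same raw_name and same effective role
    # (None is normalized to "author") means the entry is already present.
    for cc in contributor_list:
        if cc.raw_name != contributor.raw_name:
            continue
        if (cc.role or "author") != (contributor.role or "author"):
            continue
        return True
    return False

# An empty list never contains anything (the case covered by
# test_contributor_list_contains_contributor).
assert contributor_list_contains_contributor([], Contrib("Paul Katz", None)) is False

# A repeated name with the same effective role is a duplicate...
contribs = [Contrib("Paul Katz", "author")]
assert contributor_list_contains_contributor(contribs, Contrib("Paul Katz", None)) is True

# ...while the same name under a different role is kept as a separate entry.
assert contributor_list_contains_contributor(contribs, Contrib("Paul Katz", "illustrator")) is False

Comparing on the (raw_name, effective role) pair rather than on raw_name alone keeps legitimate double listings (the same person credited once as author and once in another role) while dropping the exact author/author repeats reported in GH #59.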