diff options
author | bnewbold <bnewbold@archive.org> | 2020-07-11 00:31:47 +0000 |
---|---|---|
committer | bnewbold <bnewbold@archive.org> | 2020-07-11 00:31:47 +0000 |
commit | f5aefab6a6431ab9db99761457fd47b36b920b8c (patch) | |
tree | d144988d310aeecf8521cfc33aca9f0667dfedbc | |
parent | 26b455ffad566bef58684a78654a2719c409588a (diff) | |
parent | 3c266e07771271241aa8cff3e3199a45109362af (diff) | |
download | fatcat-f5aefab6a6431ab9db99761457fd47b36b920b8c.tar.gz fatcat-f5aefab6a6431ab9db99761457fd47b36b920b8c.zip |
Merge branch 'martin-datacite-duplicated-author-gh-59' into 'master'
datacite: address duplicated contributor issue
See merge request webgroup/fatcat!65
-rw-r--r-- | python/fatcat_tools/importers/datacite.py | 66 | ||||
-rw-r--r-- | python/tests/files/datacite/datacite_doc_33.json | 62 | ||||
-rw-r--r-- | python/tests/files/datacite/datacite_doc_34.json | 61 | ||||
-rw-r--r-- | python/tests/files/datacite/datacite_result_05.json | 6 | ||||
-rw-r--r-- | python/tests/files/datacite/datacite_result_08.json | 7 | ||||
-rw-r--r-- | python/tests/files/datacite/datacite_result_09.json | 3 | ||||
-rw-r--r-- | python/tests/files/datacite/datacite_result_26.json | 3 | ||||
-rw-r--r-- | python/tests/files/datacite/datacite_result_27.json | 3 | ||||
-rw-r--r-- | python/tests/files/datacite/datacite_result_28.json | 3 | ||||
-rw-r--r-- | python/tests/files/datacite/datacite_result_29.json | 3 | ||||
-rw-r--r-- | python/tests/files/datacite/datacite_result_33.json | 31 | ||||
-rw-r--r-- | python/tests/files/datacite/datacite_result_34.json | 31 | ||||
-rw-r--r-- | python/tests/import_datacite.py | 591 |
13 files changed, 619 insertions, 251 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 785107ee..ebb29feb 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -294,7 +294,39 @@ class DataciteImporter(EntityImporter): creators = attributes.get('creators', []) or [] contributors = attributes.get('contributors', []) or [] # Much fewer than creators. - contribs = self.parse_datacite_creators(creators, doi=doi) + self.parse_datacite_creators(contributors, role=None, set_index=False, doi=doi) + contribs = self.parse_datacite_creators(creators, doi=doi) + + # Beside creators, we have contributors in datacite. Sample: + # ContactPerson, DataCollector, DataCurator, DataManager, Distributor, + # Editor, Funder, HostingInstitution, Other, Producer, ProjectLeader, + # ProjectMember, RelatedPerson, ResearchGroup, Researcher, + # RightsHolder, Sponsor, Supervisor + # + # Datacite schema: + # https://schema.datacite.org/meta/kernel-4.3/doc/DataCite-MetadataKernel_v4.3.pdf#page=32 + # -- could be used as a form of controlled vocab? + # + # Currently (07/2020) in release_contrib: + # + # select count(*), role from release_contrib group by role; + # count | role + # -----------+------------ + # 500269665 | author + # 4386563 | editor + # 17871 | translator + # 10870584 | + # (4 rows) + # + # Related: https://guide.fatcat.wiki/entity_release.html -- role + # (string, of a set): the type of contribution, from a controlled + # vocabulary. TODO: vocabulary needs review. + contribs_extra_contributors = self.parse_datacite_creators(contributors, set_index=False, doi=doi) + + # Unfortunately, creators and contributors might overlap, refs GH59. + for cc in contribs_extra_contributors: + if contributor_list_contains_contributor(contribs, cc): + continue + contribs.append(cc) # Title, may come with "attributes.titles[].titleType", like # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle" @@ -725,9 +757,10 @@ class DataciteImporter(EntityImporter): # Names, that should be ignored right away. name_blacklist = set(('Occdownload Gbif.Org',)) - for i, c in enumerate(creators): + i = 0 + for c in creators: if not set_index: - i = None + i = None nameType = c.get('nameType', '') or '' if nameType in ('', 'Personal'): creator_id = None @@ -799,8 +832,7 @@ class DataciteImporter(EntityImporter): if contributorType: extra = {'type': contributorType} - contribs.append( - fatcat_openapi_client.ReleaseContrib( + rc = fatcat_openapi_client.ReleaseContrib( creator_id=creator_id, index=i, raw_name=name, @@ -809,7 +841,12 @@ class DataciteImporter(EntityImporter): role=role, raw_affiliation=raw_affiliation, extra=extra, - )) + ) + # Filter out duplicates early. + if not contributor_list_contains_contributor(contribs, rc): + contribs.append(rc) + if i is not None: + i += 1 elif nameType == 'Organizational': name = c.get('name', '') or '' if name in UNKNOWN_MARKERS: @@ -819,12 +856,29 @@ class DataciteImporter(EntityImporter): extra = {'organization': name} contribs.append(fatcat_openapi_client.ReleaseContrib( index=i, extra=extra)) + if i is not None: + i += 1 else: print('[{}] unknown name type: {}'.format(doi, nameType), file=sys.stderr) return contribs +def contributor_list_contains_contributor(contributor_list, contributor): + """ + Given a list of contributors, determine, whether contrib is in that list. + """ + for cc in contributor_list: + if cc.raw_name != contributor.raw_name: + continue + cc_role = cc.role or 'author' + contributor_role = contributor.role or 'author' + if cc_role != contributor_role: + continue + return True + return False + + def lookup_license_slug(raw): """ Resolve a variety of strings into a some pseudo-canonical form, e.g. diff --git a/python/tests/files/datacite/datacite_doc_33.json b/python/tests/files/datacite/datacite_doc_33.json new file mode 100644 index 00000000..571d1220 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_33.json @@ -0,0 +1,62 @@ +{ + "id": "10.17912/micropub.biology.000143", + "type": "dois", + "attributes": { + "doi": "10.17912/micropub.biology.000143", + "identifiers": null, + "creators": [ + { + "name": "ABC News", + "givenName": "", + "familyName": "", + "affiliation": [], + "role": "author" + } + ], + "titles": [ + { + "title": "Sample" + } + ], + "publisher": "microPublication Biology", + "publicationYear": 2019, + "types": { + "resourceTypeGeneral": "DataPaper" + }, + "relatedIdentifiers": [], + "sizes": [], + "formats": [], + "version": null, + "rightsList": [], + "descriptions": [ + { + "description": 1234567890, + "descriptionType": "Abstract" + } + ], + "geoLocations": [], + "fundingReferences": [], + "url": "https://www.micropublication.org/journals/biology/micropub.biology.000143", + "created": "2019-08-19T14:43:08.000Z", + "registered": "2019-08-19T14:43:09.000Z", + "published": "2019", + "updated": "2019-11-09T12:32:02.000Z", + "contributors": [ + { + "name": "ABC News", + "givenName": "", + "familyName": "", + "affiliation": [], + "role": "" + } + ] + }, + "relationships": { + "client": { + "data": { + "id": "caltech.micropub", + "type": "clients" + } + } + } +} diff --git a/python/tests/files/datacite/datacite_doc_34.json b/python/tests/files/datacite/datacite_doc_34.json new file mode 100644 index 00000000..5dcf65f4 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_34.json @@ -0,0 +1,61 @@ +{ + "id": "10.17912/micropub.biology.000143", + "type": "dois", + "attributes": { + "doi": "10.17912/micropub.biology.000143", + "identifiers": null, + "creators": [ + { + "name": "Paul Katz", + "givenName": "", + "familyName": "", + "affiliation": [], + "role": "author" + } + ], + "titles": [ + { + "title": "Sample" + } + ], + "publisher": "microPublication Biology", + "publicationYear": 2019, + "types": { + "resourceTypeGeneral": "DataPaper" + }, + "relatedIdentifiers": [], + "sizes": [], + "formats": [], + "version": null, + "rightsList": [], + "descriptions": [ + { + "description": 1234567890, + "descriptionType": "Abstract" + } + ], + "geoLocations": [], + "fundingReferences": [], + "url": "https://www.micropublication.org/journals/biology/micropub.biology.000143", + "created": "2019-08-19T14:43:08.000Z", + "registered": "2019-08-19T14:43:09.000Z", + "published": "2019", + "updated": "2019-11-09T12:32:02.000Z", + "contributors": [ + { + "name": "Paul Katz", + "givenName": "", + "familyName": "", + "affiliation": [], + "role": "illustrator" + } ] + }, + "relationships": { + "client": { + "data": { + "id": "caltech.micropub", + "type": "clients" + } + } + } +} diff --git a/python/tests/files/datacite/datacite_result_05.json b/python/tests/files/datacite/datacite_result_05.json index 79c2a8fb..c91f3a7f 100644 --- a/python/tests/files/datacite/datacite_result_05.json +++ b/python/tests/files/datacite/datacite_result_05.json @@ -505,10 +505,8 @@ "surname": "Wurzbacher" }, { - "raw_name": "Kessy Abarenkov" - }, - { - "raw_name": "NHM UT-University Of Tartu; Natural History Museum And Botanic Garden" + "raw_name": "NHM UT-University Of Tartu; Natural History Museum And Botanic Garden", + "role": "author" } ], "ext_ids": { diff --git a/python/tests/files/datacite/datacite_result_08.json b/python/tests/files/datacite/datacite_result_08.json index 70237280..5a46ef50 100644 --- a/python/tests/files/datacite/datacite_result_08.json +++ b/python/tests/files/datacite/datacite_result_08.json @@ -13,13 +13,6 @@ "raw_name": "Kei Kajisa", "role": "author", "surname": "Kajisa" - }, - { - "given_name": "Kei", - "index": 1, - "raw_name": "Kei Kajisa", - "role": "author", - "surname": "Kajisa" } ], "ext_ids": { diff --git a/python/tests/files/datacite/datacite_result_09.json b/python/tests/files/datacite/datacite_result_09.json index 09e02fc7..f6ec524a 100644 --- a/python/tests/files/datacite/datacite_result_09.json +++ b/python/tests/files/datacite/datacite_result_09.json @@ -17,7 +17,8 @@ "extra": { "type": "DataManager" }, - "raw_name": "Technische Informationsbibliothek (TIB)" + "raw_name": "Technische Informationsbibliothek (TIB)", + "role": "author" } ], "ext_ids": { diff --git a/python/tests/files/datacite/datacite_result_26.json b/python/tests/files/datacite/datacite_result_26.json index 267eb9c2..f6e589ef 100644 --- a/python/tests/files/datacite/datacite_result_26.json +++ b/python/tests/files/datacite/datacite_result_26.json @@ -13,7 +13,8 @@ }, "given_name": "David", "raw_name": "David Wemmer", - "surname": "Wemmer" + "surname": "Wemmer", + "role": "author" } ], "ext_ids": { diff --git a/python/tests/files/datacite/datacite_result_27.json b/python/tests/files/datacite/datacite_result_27.json index 3d033e6a..e934fb41 100644 --- a/python/tests/files/datacite/datacite_result_27.json +++ b/python/tests/files/datacite/datacite_result_27.json @@ -13,7 +13,8 @@ }, "given_name": "David", "raw_name": "David Wemmer", - "surname": "Wemmer" + "surname": "Wemmer", + "role": "author" } ], "ext_ids": { diff --git a/python/tests/files/datacite/datacite_result_28.json b/python/tests/files/datacite/datacite_result_28.json index 84bed9c8..bcb1caaf 100644 --- a/python/tests/files/datacite/datacite_result_28.json +++ b/python/tests/files/datacite/datacite_result_28.json @@ -13,7 +13,8 @@ }, "given_name": "David", "raw_name": "David Wemmer", - "surname": "Wemmer" + "surname": "Wemmer", + "role": "author" } ], "ext_ids": { diff --git a/python/tests/files/datacite/datacite_result_29.json b/python/tests/files/datacite/datacite_result_29.json index 84bed9c8..bcb1caaf 100644 --- a/python/tests/files/datacite/datacite_result_29.json +++ b/python/tests/files/datacite/datacite_result_29.json @@ -13,7 +13,8 @@ }, "given_name": "David", "raw_name": "David Wemmer", - "surname": "Wemmer" + "surname": "Wemmer", + "role": "author" } ], "ext_ids": { diff --git a/python/tests/files/datacite/datacite_result_33.json b/python/tests/files/datacite/datacite_result_33.json new file mode 100644 index 00000000..bcb72469 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_33.json @@ -0,0 +1,31 @@ +{ + "abstracts": [ + { + "content": "1234567890", + "mimetype": "text/plain" + } + ], + "contribs": [ + { + "given_name": "", + "surname": "", + "index": 0, + "raw_name": "ABC News", + "role": "author" + } + ], + "ext_ids": { + "doi": "10.17912/micropub.biology.000143" + }, + "extra": { + "datacite": { + "resourceTypeGeneral": "DataPaper" + }, + "container_name": "microPublication Biology" + }, + "refs": [], + "release_stage": "published", + "release_year": 2019, + "publisher": "microPublication Biology", + "title": "Sample" +} diff --git a/python/tests/files/datacite/datacite_result_34.json b/python/tests/files/datacite/datacite_result_34.json new file mode 100644 index 00000000..4a52e22c --- /dev/null +++ b/python/tests/files/datacite/datacite_result_34.json @@ -0,0 +1,31 @@ +{ + "abstracts": [ + { + "content": "1234567890", + "mimetype": "text/plain" + } + ], + "contribs": [ + { + "given_name": "", + "surname": "", + "index": 0, + "raw_name": "Paul Katz", + "role": "author" + } + ], + "ext_ids": { + "doi": "10.17912/micropub.biology.000143" + }, + "extra": { + "datacite": { + "resourceTypeGeneral": "DataPaper" + }, + "container_name": "microPublication Biology" + }, + "refs": [], + "release_stage": "published", + "release_year": 2019, + "publisher": "microPublication Biology", + "title": "Sample" +} diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index 8fb2d079..b94b6bc5 100644 --- a/python/tests/import_datacite.py +++ b/python/tests/import_datacite.py @@ -10,32 +10,54 @@ import collections import pytest from fatcat_tools.importers import DataciteImporter, JsonLinePusher -from fatcat_tools.importers.datacite import find_original_language_title, parse_datacite_titles, parse_datacite_dates, index_form_to_display_name, lookup_license_slug +from fatcat_tools.importers.datacite import ( + find_original_language_title, + parse_datacite_titles, + parse_datacite_dates, + clean_doi, + index_form_to_display_name, + lookup_license_slug, + contributor_list_contains_contributor, +) from fatcat_tools.transforms import entity_to_dict -from fixtures import * +import fatcat_openapi_client +from fixtures import api +import json @pytest.fixture(scope="function") def datacite_importer(api): - with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: - yield DataciteImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', - bezerk_mode=True) + with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file: + yield DataciteImporter( + api, + issn_file, + extid_map_file="tests/files/example_map.sqlite3", + bezerk_mode=True, + ) + @pytest.fixture(scope="function") def datacite_importer_existing(api): - with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: - yield DataciteImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', - bezerk_mode=False) + with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file: + yield DataciteImporter( + api, + issn_file, + extid_map_file="tests/files/example_map.sqlite3", + bezerk_mode=False, + ) + @pytest.mark.skip(reason="larger datacite import slows tests down") def test_datacite_importer_huge(datacite_importer): last_index = datacite_importer.api.get_changelog(limit=1)[0].index - with gzip.open('tests/files/datacite_1k_records.jsonl.gz', 'rt') as f: + with gzip.open("tests/files/datacite_1k_records.jsonl.gz", "rt") as f: datacite_importer.bezerk_mode = True counts = JsonLinePusher(datacite_importer, f).run() - assert counts['insert'] == 998 - change = datacite_importer.api.get_changelog_entry(index=last_index+1) - release = datacite_importer.api.get_release(change.editgroup.edits.releases[0].ident) + assert counts["insert"] == 998 + change = datacite_importer.api.get_changelog_entry(index=last_index + 1) + release = datacite_importer.api.get_release( + change.editgroup.edits.releases[0].ident + ) assert len(release.contribs) == 3 @@ -43,122 +65,161 @@ def test_find_original_language_title(): """ Original language might be included, in various ways. """ - Case = collections.namedtuple('Case', 'about input result') + Case = collections.namedtuple("Case", "about input result") cases = [ - Case('defaults to None', {}, None), - Case('ignore unknown keys', {'broken': 'kv'}, None), - Case('just a title', {'title': 'Noise Reduction'}, None), - Case('same title should be ignored', { - 'title': 'Noise Reduction', - 'original_language_title': 'Noise Reduction' - }, None), - Case('empty subdict is ignored', { - 'title': 'Noise Reduction', - 'original_language_title': {}, - }, None), - Case('unknown subdict keys are ignored', { - 'title': 'Noise Reduction', - 'original_language_title': {'broken': 'kv'}, - }, None), - Case('original string', { - 'title': 'Noise Reduction', - 'original_language_title': 'Подавление шума', - }, 'Подавление шума'), - Case('language tag is ignored, since its broken', { - 'title': 'Noise Reduction', - 'original_language_title': { - 'language': 'ja', - '__content__': 'Noise Reduction' + Case("defaults to None", {}, None), + Case("ignore unknown keys", {"broken": "kv"}, None), + Case("just a title", {"title": "Noise Reduction"}, None), + Case( + "same title should be ignored", + {"title": "Noise Reduction", "original_language_title": "Noise Reduction"}, + None, + ), + Case( + "empty subdict is ignored", + {"title": "Noise Reduction", "original_language_title": {},}, + None, + ), + Case( + "unknown subdict keys are ignored", + {"title": "Noise Reduction", "original_language_title": {"broken": "kv"},}, + None, + ), + Case( + "original string", + {"title": "Noise Reduction", "original_language_title": "Подавление шума",}, + "Подавление шума", + ), + Case( + "language tag is ignored, since its broken", + { + "title": "Noise Reduction", + "original_language_title": { + "language": "ja", + "__content__": "Noise Reduction", + }, }, - }, None), - Case('do not care about language', { - 'title': 'Noise Reduction', - 'original_language_title': { - 'language': 'ja', - '__content__': 'Rauschunterdrückung', + None, + ), + Case( + "do not care about language", + { + "title": "Noise Reduction", + "original_language_title": { + "language": "ja", + "__content__": "Rauschunterdrückung", + }, }, - }, 'Rauschunterdrückung'), - Case('ignore excessive questionmarks', { - 'title': 'Noise Reduction', - 'original_language_title': { - 'language': 'ja', - '__content__': '???? However', + "Rauschunterdrückung", + ), + Case( + "ignore excessive questionmarks", + { + "title": "Noise Reduction", + "original_language_title": { + "language": "ja", + "__content__": "???? However", + }, }, - }, None), + None, + ), ] for case in cases: result = find_original_language_title(case.input) assert result == case.result + def test_parse_datacite_titles(): """ Given a list of titles, find title, original_language_title and subtitle. Result is a 3-tuple of title, original_language_title, subtitle. """ - Case = collections.namedtuple('Case', 'about input result') + Case = collections.namedtuple("Case", "about input result") cases = [ - Case('handle None', None, (None, None, None)), - Case('empty list', [], (None, None, None)), - Case('empty item', [{}], (None, None, None)), - Case('broken keys', [{'broken': 'kv'}], (None, None, None)), - Case('title only', [{'title': 'Total carbon dioxide'}], - ('Total carbon dioxide', None, None), - ), - Case('title and subtitle', [ - {'title': 'Total carbon dioxide'}, - {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, - ], - ('Total carbon dioxide', None, 'Station TT043_7-9'), - ), - Case('title, subtitle order does not matter', [ - {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, - {'title': 'Total carbon dioxide'}, - ], - ('Total carbon dioxide', None, 'Station TT043_7-9'), - ), - Case('multiple titles, first wins', [ - {'title': 'Total carbon dioxide'}, - {'title': 'Meeting Heterogeneity'}, - ], - ('Total carbon dioxide', None, None), - ), - Case('multiple titles, plus sub', [ - {'title': 'Total carbon dioxide'}, - {'title': 'Meeting Heterogeneity'}, - {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, - ], - ('Total carbon dioxide', None, 'Station TT043_7-9'), - ), - Case('multiple titles, multiple subs', [ - {'title': 'Total carbon dioxide'}, - {'title': 'Meeting Heterogeneity'}, - {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, - {'title': 'Some other subtitle', 'titleType': 'Subtitle'}, - ], - ('Total carbon dioxide', None, 'Station TT043_7-9'), - ), - Case('title, original, sub', [ - {'title': 'Total carbon dioxide', 'original_language_title': 'Всего углекислого газа'}, - {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, - ], - ('Total carbon dioxide', 'Всего углекислого газа', 'Station TT043_7-9'), - ), - Case('title, original same as title, sub', [ - {'title': 'Total carbon dioxide', 'original_language_title': { - '__content__': 'Total carbon dioxide', - }}, - {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, - ], - ('Total carbon dioxide', None, 'Station TT043_7-9'), - ), - Case('title, original dict, sub', [ - {'title': 'Total carbon dioxide', 'original_language_title': { - '__content__': 'Всего углекислого газа', - }}, - {'title': 'Station TT043_7-9', 'titleType': 'Subtitle'}, - ], - ('Total carbon dioxide', 'Всего углекислого газа', 'Station TT043_7-9'), + Case("handle None", None, (None, None, None)), + Case("empty list", [], (None, None, None)), + Case("empty item", [{}], (None, None, None)), + Case("broken keys", [{"broken": "kv"}], (None, None, None)), + Case( + "title only", + [{"title": "Total carbon dioxide"}], + ("Total carbon dioxide", None, None), + ), + Case( + "title and subtitle", + [ + {"title": "Total carbon dioxide"}, + {"title": "Station TT043_7-9", "titleType": "Subtitle"}, + ], + ("Total carbon dioxide", None, "Station TT043_7-9"), + ), + Case( + "title, subtitle order does not matter", + [ + {"title": "Station TT043_7-9", "titleType": "Subtitle"}, + {"title": "Total carbon dioxide"}, + ], + ("Total carbon dioxide", None, "Station TT043_7-9"), + ), + Case( + "multiple titles, first wins", + [{"title": "Total carbon dioxide"}, {"title": "Meeting Heterogeneity"},], + ("Total carbon dioxide", None, None), + ), + Case( + "multiple titles, plus sub", + [ + {"title": "Total carbon dioxide"}, + {"title": "Meeting Heterogeneity"}, + {"title": "Station TT043_7-9", "titleType": "Subtitle"}, + ], + ("Total carbon dioxide", None, "Station TT043_7-9"), + ), + Case( + "multiple titles, multiple subs", + [ + {"title": "Total carbon dioxide"}, + {"title": "Meeting Heterogeneity"}, + {"title": "Station TT043_7-9", "titleType": "Subtitle"}, + {"title": "Some other subtitle", "titleType": "Subtitle"}, + ], + ("Total carbon dioxide", None, "Station TT043_7-9"), + ), + Case( + "title, original, sub", + [ + { + "title": "Total carbon dioxide", + "original_language_title": "Всего углекислого газа", + }, + {"title": "Station TT043_7-9", "titleType": "Subtitle"}, + ], + ("Total carbon dioxide", "Всего углекислого газа", "Station TT043_7-9"), + ), + Case( + "title, original same as title, sub", + [ + { + "title": "Total carbon dioxide", + "original_language_title": {"__content__": "Total carbon dioxide",}, + }, + {"title": "Station TT043_7-9", "titleType": "Subtitle"}, + ], + ("Total carbon dioxide", None, "Station TT043_7-9"), + ), + Case( + "title, original dict, sub", + [ + { + "title": "Total carbon dioxide", + "original_language_title": { + "__content__": "Всего углекислого газа", + }, + }, + {"title": "Station TT043_7-9", "titleType": "Subtitle"}, + ], + ("Total carbon dioxide", "Всего углекислого газа", "Station TT043_7-9"), ), ] @@ -166,91 +227,128 @@ def test_parse_datacite_titles(): result = parse_datacite_titles(case.input) assert result == case.result, case.about + def test_parse_datacite_dates(): """ Test datacite date parsing. """ - Case = collections.namedtuple('Case', 'about input result') + Case = collections.namedtuple("Case", "about input result") cases = [ - Case('None is None', None, (None, None, None)), - Case('empty list is None', [], (None, None, None)), - Case('empty item is None', [{}], (None, None, None)), - Case('year only yields year only', [{'date': '2019'}], (None, None, 2019)), - Case('int year', [{'date': 2019}], (None, None, 2019)), - Case('first wins', [{'date': '2019'}, {'date': '2020'}], (None, None, 2019)), - Case('skip bogus year', [{'date': 'abc'}, {'date': '2020'}], (None, None, 2020)), - Case('first with type', [ - {'date': '2019', 'dateType': 'Accepted'}, {'date': '2020'} - ], (None, None, 2019)), - Case('full date', [ - {'date': '2019-12-01', 'dateType': 'Valid'}, - ], (datetime.date(2019, 12, 1), 12, 2019)), - Case('date type prio', [ - {'date': '2000-12-01', 'dateType': 'Valid'}, - {'date': '2010-01-01', 'dateType': 'Updated'}, - ], (datetime.date(2000, 12, 1), 12, 2000)), - Case('date type prio, Available > Updated', [ - {'date': '2010-01-01', 'dateType': 'Updated'}, - {'date': '2000-12-01', 'dateType': 'Available'}, - ], (datetime.date(2000, 12, 1), 12, 2000)), - Case('allow different date formats, Available > Updated', [ - {'date': '2010-01-01T10:00:00', 'dateType': 'Updated'}, - {'date': '2000-12-01T10:00:00', 'dateType': 'Available'}, - ], (datetime.date(2000, 12, 1), 12, 2000)), - Case('allow different date formats, Available > Updated', [ - {'date': '2010-01-01T10:00:00Z', 'dateType': 'Updated'}, - {'date': '2000-12-01T10:00:00Z', 'dateType': 'Available'}, - ], (datetime.date(2000, 12, 1), 12, 2000)), - Case('allow fuzzy date formats, Available > Updated', [ - {'date': '2010', 'dateType': 'Updated'}, - {'date': '2000 Dec 01', 'dateType': 'Available'}, - ], (datetime.date(2000, 12, 1), 12, 2000)), - Case('fuzzy year only', [ - {'date': 'Year 2010', 'dateType': 'Issued'}, - ], (None, None, 2010)), - Case('fuzzy year and month', [ - {'date': 'Year 2010 Feb', 'dateType': 'Issued'}, - ], (None, 2, 2010)), - Case('fuzzy year, month, day', [ - {'date': 'Year 2010 Feb 24', 'dateType': 'Issued'}, - ], (datetime.date(2010, 2, 24), 2, 2010)), - Case('ignore broken date', [ - {'date': 'Febrrr 45', 'dateType': 'Updated'}, - ], (None, None, None)), + Case("None is None", None, (None, None, None)), + Case("empty list is None", [], (None, None, None)), + Case("empty item is None", [{}], (None, None, None)), + Case("year only yields year only", [{"date": "2019"}], (None, None, 2019)), + Case("int year", [{"date": 2019}], (None, None, 2019)), + Case("first wins", [{"date": "2019"}, {"date": "2020"}], (None, None, 2019)), + Case( + "skip bogus year", [{"date": "abc"}, {"date": "2020"}], (None, None, 2020) + ), + Case( + "first with type", + [{"date": "2019", "dateType": "Accepted"}, {"date": "2020"}], + (None, None, 2019), + ), + Case( + "full date", + [{"date": "2019-12-01", "dateType": "Valid"},], + (datetime.date(2019, 12, 1), 12, 2019), + ), + Case( + "date type prio", + [ + {"date": "2000-12-01", "dateType": "Valid"}, + {"date": "2010-01-01", "dateType": "Updated"}, + ], + (datetime.date(2000, 12, 1), 12, 2000), + ), + Case( + "date type prio, Available > Updated", + [ + {"date": "2010-01-01", "dateType": "Updated"}, + {"date": "2000-12-01", "dateType": "Available"}, + ], + (datetime.date(2000, 12, 1), 12, 2000), + ), + Case( + "allow different date formats, Available > Updated", + [ + {"date": "2010-01-01T10:00:00", "dateType": "Updated"}, + {"date": "2000-12-01T10:00:00", "dateType": "Available"}, + ], + (datetime.date(2000, 12, 1), 12, 2000), + ), + Case( + "allow different date formats, Available > Updated", + [ + {"date": "2010-01-01T10:00:00Z", "dateType": "Updated"}, + {"date": "2000-12-01T10:00:00Z", "dateType": "Available"}, + ], + (datetime.date(2000, 12, 1), 12, 2000), + ), + Case( + "allow fuzzy date formats, Available > Updated", + [ + {"date": "2010", "dateType": "Updated"}, + {"date": "2000 Dec 01", "dateType": "Available"}, + ], + (datetime.date(2000, 12, 1), 12, 2000), + ), + Case( + "fuzzy year only", + [{"date": "Year 2010", "dateType": "Issued"},], + (None, None, 2010), + ), + Case( + "fuzzy year and month", + [{"date": "Year 2010 Feb", "dateType": "Issued"},], + (None, 2, 2010), + ), + Case( + "fuzzy year, month, day", + [{"date": "Year 2010 Feb 24", "dateType": "Issued"},], + (datetime.date(2010, 2, 24), 2, 2010), + ), + Case( + "ignore broken date", + [{"date": "Febrrr 45", "dateType": "Updated"},], + (None, None, None), + ), ] for case in cases: result = parse_datacite_dates(case.input) assert result == case.result, case.about + def test_datacite_importer(datacite_importer): last_index = datacite_importer.api.get_changelog(limit=1)[0].index - with open('tests/files/datacite_sample.jsonl', 'r') as f: + with open("tests/files/datacite_sample.jsonl", "r") as f: datacite_importer.bezerk_mode = True counts = JsonLinePusher(datacite_importer, f).run() - assert counts['insert'] == 1 - assert counts['exists'] == 0 - assert counts['skip'] == 0 + assert counts["insert"] == 1 + assert counts["exists"] == 0 + assert counts["skip"] == 0 # fetch most recent editgroup - change = datacite_importer.api.get_changelog_entry(index=last_index+1) + change = datacite_importer.api.get_changelog_entry(index=last_index + 1) eg = change.editgroup assert eg.description assert "datacite" in eg.description.lower() - assert eg.extra['git_rev'] - assert "fatcat_tools.DataciteImporter" in eg.extra['agent'] + assert eg.extra["git_rev"] + assert "fatcat_tools.DataciteImporter" in eg.extra["agent"] last_index = datacite_importer.api.get_changelog(limit=1)[0].index - with open('tests/files/datacite_sample.jsonl', 'r') as f: + with open("tests/files/datacite_sample.jsonl", "r") as f: datacite_importer.bezerk_mode = False datacite_importer.reset() counts = JsonLinePusher(datacite_importer, f).run() - assert counts['insert'] == 0 - assert counts['exists'] == 1 - assert counts['skip'] == 0 + assert counts["insert"] == 0 + assert counts["exists"] == 1 + assert counts["skip"] == 0 assert last_index == datacite_importer.api.get_changelog(limit=1)[0].index + def test_datacite_dict_parse(datacite_importer): - with open('tests/files/datacite_sample.jsonl', 'r') as f: + with open("tests/files/datacite_sample.jsonl", "r") as f: raw = json.load(f) r = datacite_importer.parse_record(raw) # ensure the API server is ok with format @@ -258,7 +356,9 @@ def test_datacite_dict_parse(datacite_importer): print(r.extra) assert r.title == "Triticum turgidum L. subsp. durum (Desf.) Husn. 97090" - assert r.publisher == "International Centre for Agricultural Research in Dry Areas" + assert ( + r.publisher == "International Centre for Agricultural Research in Dry Areas" + ) assert r.release_type == "article" assert r.release_stage == "published" assert r.license_slug == None @@ -269,13 +369,15 @@ def test_datacite_dict_parse(datacite_importer): assert r.subtitle == None assert r.release_date == None assert r.release_year == 1986 - assert 'subtitle' not in r.extra - assert 'subtitle' not in r.extra['datacite'] - assert 'funder' not in r.extra - assert 'funder' not in r.extra['datacite'] + assert "subtitle" not in r.extra + assert "subtitle" not in r.extra["datacite"] + assert "funder" not in r.extra + assert "funder" not in r.extra["datacite"] # matched by ISSN, so shouldn't be in there - #assert extra['container_name'] == "International Journal of Quantum Chemistry" - assert r.extra['datacite']['subjects'] == [{'subject': 'Plant Genetic Resource for Food and Agriculture'}] + # assert extra['container_name'] == "International Journal of Quantum Chemistry" + assert r.extra["datacite"]["subjects"] == [ + {"subject": "Plant Genetic Resource for Food and Agriculture"} + ] assert len(r.abstracts) == 1 assert len(r.abstracts[0].content) == 421 assert len(r.contribs) == 2 @@ -284,34 +386,41 @@ def test_datacite_dict_parse(datacite_importer): assert r.contribs[0].surname == None assert len(r.refs) == 0 + def test_datacite_conversions(datacite_importer): """ Datacite JSON to release entity JSON representation. The count is hardcoded for now. """ datacite_importer.debug = True - for i in range(33): - src = 'tests/files/datacite/datacite_doc_{0:02d}.json'.format(i) - dst = 'tests/files/datacite/datacite_result_{0:02d}.json'.format(i) - with open(src, 'r') as f: + for i in range(35): + src = "tests/files/datacite/datacite_doc_{0:02d}.json".format(i) + dst = "tests/files/datacite/datacite_result_{0:02d}.json".format(i) + with open(src, "r") as f: re = datacite_importer.parse_record(json.load(f)) result = entity_to_dict(re) - with open(dst, 'r') as f: + with open(dst, "r") as f: expected = json.loads(f.read()) - assert result == expected, 'output mismatch in {}'.format(dst) + assert result == expected, "output mismatch in {}".format(dst) + def test_index_form_to_display_name(): - Case = collections.namedtuple('Case', 'input output') + Case = collections.namedtuple("Case", "input output") cases = [ - Case('', ''), - Case('ABC', 'ABC'), - Case('International Space Station', 'International Space Station'), - Case('Jin, Shan', 'Shan Jin'), - Case('Volkshochschule Der Bundesstadt Bonn', 'Volkshochschule Der Bundesstadt Bonn'), - Case('Solomon, P. M.', 'P. M. Solomon'), - Case('Sujeevan Ratnasingham', 'Sujeevan Ratnasingham'), - Case('Paul Stöckli (1906-1991), Künstler', 'Paul Stöckli (1906-1991), Künstler'), + Case("", ""), + Case("ABC", "ABC"), + Case("International Space Station", "International Space Station"), + Case("Jin, Shan", "Shan Jin"), + Case( + "Volkshochschule Der Bundesstadt Bonn", + "Volkshochschule Der Bundesstadt Bonn", + ), + Case("Solomon, P. M.", "P. M. Solomon"), + Case("Sujeevan Ratnasingham", "Sujeevan Ratnasingham"), + Case( + "Paul Stöckli (1906-1991), Künstler", "Paul Stöckli (1906-1991), Künstler" + ), ] for c in cases: @@ -319,45 +428,69 @@ def test_index_form_to_display_name(): def test_lookup_license_slug(): - Case = collections.namedtuple('Case', 'input output') + Case = collections.namedtuple("Case", "input output") cases = [ - Case('https://opensource.org/licenses/MIT', 'MIT'), - Case('creativecommons.org/licenses/by-nc-nd/3.0/', 'CC-BY-NC-ND'), - Case('http://creativecommons.org/licences/by-nc-sa/4.0', 'CC-BY-NC-SA'), - Case('http://creativecommons.org/licenses/by-nc-nd/2.5/co', 'CC-BY-NC-ND'), - Case('http://creativecommons.org/licenses/by-nd/4.0/legalcode', 'CC-BY-ND'), - Case('http://creativecommons.org/licenses/by/2.0/uk/legalcode', 'CC-BY'), - Case('http://creativecommons.org/publicdomain/zero/1.0/legalcode', 'CC-0'), - Case('http://doi.wiley.com/10.1002/tdm_license_1.1', 'WILEY-TDM-1.1'), - Case('http://homepage.data-planet.com/terms-use', 'SAGE-DATA-PLANET'), - Case('http://www.springer.com/tdm', 'SPRINGER-TDM'), - Case('https://archaeologydataservice.ac.uk/advice/termsOfUseAndAccess.xhtml', 'ADS-UK'), - Case('https://archaeologydataservice.ac.uk/advice/termsOfUseAndAccess', 'ADS-UK'), - Case('https://creativecommons.org/public-domain/cc0', 'CC-0'), - Case('https://creativecommons.org/publicdomain/zero/1.0', 'CC-0'), - Case('https://creativecommons.org/share-your-work/public-domain/cc0', 'CC-0'), - Case('https://www.elsevier.com/tdm/userlicense/1.0', 'ELSEVIER-USER-1.0'), - Case('https://www.gnu.org/licenses/gpl-3.0.html', 'GPL-3.0'), - Case('http://rightsstatements.org/page/InC/1.0?language=en', 'RS-INC'), - Case('http://onlinelibrary.wiley.com/termsAndConditions', 'WILEY'), - Case('https://publikationen.bibliothek.kit.edu/kitopen-lizenz', 'KIT-OPEN'), - Case('http://journals.sagepub.com/page/policies/text-and-data-mining-license', 'SAGE-TDM'), - Case('https://creativecommons.org/publicdomain/mark/1.0/deed.de', 'CC-PUBLICDOMAIN'), - Case('http://creativecommons.org/publicdomain/mark/1.0', 'CC-PUBLICDOMAIN'), - Case('https://creativecommons.org/publicdomain/mark/1.0', 'CC-PUBLICDOMAIN'), - Case('https://creativecommons.org/publicdomain/mark/1.0/', 'CC-PUBLICDOMAIN'), - Case('https://creativecommons.org/publicdomain/mark/1.0/deed.de', 'CC-PUBLICDOMAIN'), - Case('https://creativecommons.org/share-your-work/public-domain/cc0/', 'CC-0'), - Case('http://spdx.org/licenses/CC0-1.0.json', 'CC-0'), - Case('http://spdx.org/licenses/CC-BY-1.0.json', 'CC-BY'), - Case('http://spdx.org/licenses/CC-BY-4.0.json', 'CC-BY'), - Case('http://spdx.org/licenses/CC-BY-NC-4.0.json', 'CC-BY-NC'), - Case('http://spdx.org/licenses/CC-BY-SA-3.0.json', 'CC-BY-SA'), - Case('http://spdx.org/licenses/CC-BY-SA-4.0.json', 'CC-BY-SA'), - Case('http://spdx.org/licenses/MIT.json', 'MIT'), - Case('http://spdx.org/licenses/OGL-Canada-2.0.json', 'OGL-CANADA'), + Case("https://opensource.org/licenses/MIT", "MIT"), + Case("creativecommons.org/licenses/by-nc-nd/3.0/", "CC-BY-NC-ND"), + Case("http://creativecommons.org/licences/by-nc-sa/4.0", "CC-BY-NC-SA"), + Case("http://creativecommons.org/licenses/by-nc-nd/2.5/co", "CC-BY-NC-ND"), + Case("http://creativecommons.org/licenses/by-nd/4.0/legalcode", "CC-BY-ND"), + Case("http://creativecommons.org/licenses/by/2.0/uk/legalcode", "CC-BY"), + Case("http://creativecommons.org/publicdomain/zero/1.0/legalcode", "CC-0"), + Case("http://doi.wiley.com/10.1002/tdm_license_1.1", "WILEY-TDM-1.1"), + Case("http://homepage.data-planet.com/terms-use", "SAGE-DATA-PLANET"), + Case("http://www.springer.com/tdm", "SPRINGER-TDM"), + Case( + "https://archaeologydataservice.ac.uk/advice/termsOfUseAndAccess.xhtml", + "ADS-UK", + ), + Case( + "https://archaeologydataservice.ac.uk/advice/termsOfUseAndAccess", "ADS-UK" + ), + Case("https://creativecommons.org/public-domain/cc0", "CC-0"), + Case("https://creativecommons.org/publicdomain/zero/1.0", "CC-0"), + Case("https://creativecommons.org/share-your-work/public-domain/cc0", "CC-0"), + Case("https://www.elsevier.com/tdm/userlicense/1.0", "ELSEVIER-USER-1.0"), + Case("https://www.gnu.org/licenses/gpl-3.0.html", "GPL-3.0"), + Case("http://rightsstatements.org/page/InC/1.0?language=en", "RS-INC"), + Case("http://onlinelibrary.wiley.com/termsAndConditions", "WILEY"), + Case("https://publikationen.bibliothek.kit.edu/kitopen-lizenz", "KIT-OPEN"), + Case( + "http://journals.sagepub.com/page/policies/text-and-data-mining-license", + "SAGE-TDM", + ), + Case( + "https://creativecommons.org/publicdomain/mark/1.0/deed.de", + "CC-PUBLICDOMAIN", + ), + Case("http://creativecommons.org/publicdomain/mark/1.0", "CC-PUBLICDOMAIN"), + Case("https://creativecommons.org/publicdomain/mark/1.0", "CC-PUBLICDOMAIN"), + Case("https://creativecommons.org/publicdomain/mark/1.0/", "CC-PUBLICDOMAIN"), + Case( + "https://creativecommons.org/publicdomain/mark/1.0/deed.de", + "CC-PUBLICDOMAIN", + ), + Case("https://creativecommons.org/share-your-work/public-domain/cc0/", "CC-0"), + Case("http://spdx.org/licenses/CC0-1.0.json", "CC-0"), + Case("http://spdx.org/licenses/CC-BY-1.0.json", "CC-BY"), + Case("http://spdx.org/licenses/CC-BY-4.0.json", "CC-BY"), + Case("http://spdx.org/licenses/CC-BY-NC-4.0.json", "CC-BY-NC"), + Case("http://spdx.org/licenses/CC-BY-SA-3.0.json", "CC-BY-SA"), + Case("http://spdx.org/licenses/CC-BY-SA-4.0.json", "CC-BY-SA"), + Case("http://spdx.org/licenses/MIT.json", "MIT"), + Case("http://spdx.org/licenses/OGL-Canada-2.0.json", "OGL-CANADA"), ] for c in cases: got = lookup_license_slug(c.input) - assert c.output == got, '{}: got {}, want {}'.format(c.input, got, c.output) + assert c.output == got, "{}: got {}, want {}".format(c.input, got, c.output) + + +def test_contributor_list_contains_contributor(): + Case = collections.namedtuple("Case", "contrib_list contrib want") + cases = [ + Case([], fatcat_openapi_client.ReleaseContrib(raw_name="Paul Katz"), False), + ] + for c in cases: + got = contributor_list_contains_contributor(c.contrib_list, c.contrib) + assert got == c.want |