From 62d6a7e48d6bea1bc7f451c6043f38aee2051f9b Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Wed, 8 Jan 2020 22:33:58 +0100 Subject: datacite: factor out contributor handling Use values from: * attributes.creators[] * attributes.contributors[] --- python/fatcat_tools/importers/datacite.py | 183 ++++++++++++--------- python/tests/files/datacite/datacite_doc_26.json | 57 +++++++ .../tests/files/datacite/datacite_result_05.json | 6 + .../tests/files/datacite/datacite_result_09.json | 11 ++ .../tests/files/datacite/datacite_result_26.json | 31 ++++ python/tests/import_datacite.py | 4 +- 6 files changed, 210 insertions(+), 82 deletions(-) create mode 100644 python/tests/files/datacite/datacite_doc_26.json create mode 100644 python/tests/files/datacite/datacite_result_26.json diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index fc986994..9ca72758 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -303,88 +303,11 @@ class DataciteImporter(EntityImporter): print('[{}] skipping non-ascii doi for now'.format(doi)) return None - # Contributors. Many nameIdentifierSchemes, we do not use (yet): - # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme": - # ["LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID", - # "SCOPUS", "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID"]. - contribs = [] - - # Names, that should be ignored right away. 
- name_blacklist = set(('Occdownload Gbif.Org',)) - - for i, c in enumerate(attributes['creators']): - nameType = c.get('nameType', '') or '' - if nameType in ('', 'Personal'): - creator_id = None - for nid in c.get('nameIdentifiers', []): - name_scheme = nid.get('nameIdentifierScheme', '') or '' - if not name_scheme.lower() == "orcid": - continue - orcid = nid.get('nameIdentifier', - '').replace('https://orcid.org/', '') - if not orcid: - continue - creator_id = self.lookup_orcid(orcid) - # TODO(martin): If creator_id is None, should we create creators? - - # If there are multiple affiliation strings, use the first one. - affiliations = c.get('affiliation', []) or [] - raw_affiliation = None - if len(affiliations) == 0: - raw_affiliation = None - else: - raw_affiliation = clean(affiliations[0]) - - name = c.get('name') - given_name = c.get('givenName') - surname = c.get('familyName') - - if name: - name = clean(name) - - if name in name_blacklist: - continue - - if given_name: - given_name = clean(given_name) - - if surname: - surname = clean(surname) - - if not name: - continue - - if raw_affiliation == '': - continue - if name.lower() in UNKNOWN_MARKERS: - continue + creators = attributes.get('creators', []) or [] + contributors = attributes.get('contributors', []) or [] # Much fewer than creators. - # Unpack name, if we have an index form (e.g. 'Razis, Panos A') into 'Panos A razis'. 
-                if name:
-                    name = index_form_to_display_name(name)
-
-                contribs.append(
-                    fatcat_openapi_client.ReleaseContrib(
-                        creator_id=creator_id,
-                        index=i,
-                        raw_name=name,
-                        given_name=given_name,
-                        surname=surname,
-                        role='author',
-                        raw_affiliation=raw_affiliation,
-                    ))
-            elif nameType == 'Organizational':
-                name = c.get('name', '') or ''
-                if name in UNKNOWN_MARKERS:
-                    continue
-                if len(name) < 3:
-                    continue
-                extra = {'organization': name}
-                contribs.append(fatcat_openapi_client.ReleaseContrib(
-                    index=i, extra=extra))
-            else:
-                print('[{}] unknown name type: {}'.format(doi, nameType), file=sys.stderr)
+        contribs = self.parse_datacite_creators(creators, doi=doi) + self.parse_datacite_creators(contributors, role=None, set_index=False, doi=doi)
 
         # Title, may come with "attributes.titles[].titleType", like
         # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle"
@@ -767,6 +690,104 @@ class DataciteImporter(EntityImporter):
                 extra=self.editgroup_extra),
             entity_list=batch))
 
+    def parse_datacite_creators(self, creators, role='author', set_index=True, doi=None):
+        """
+        Parses a list of creators (or contributors) into a list of
+        ReleaseContrib objects.
+
+        Set set_index to False, if the index contrib field should be left
+        blank. The role is passed on to personal contribs only. The doi is
+        optional and only used to give context in log messages.
+        """
+        # Contributors. Many nameIdentifierSchemes, we do not use (yet):
+        # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme":
+        # ["LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID",
+        # "SCOPUS", "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID"].
+        contribs = []
+
+        # Names, that should be ignored right away.
+        name_blacklist = set(('Occdownload Gbif.Org',))
+
+        for i, c in enumerate(creators):
+            if not set_index:
+                i = None
+            nameType = c.get('nameType', '') or ''
+            if nameType in ('', 'Personal'):
+                creator_id = None
+                for nid in c.get('nameIdentifiers', []) or []:
+                    name_scheme = nid.get('nameIdentifierScheme', '') or ''
+                    if not name_scheme.lower() == "orcid":
+                        continue
+                    orcid = (nid.get('nameIdentifier', '') or '').replace('https://orcid.org/', '')
+                    if not orcid:
+                        continue
+                    creator_id = self.lookup_orcid(orcid)
+                    # TODO(martin): If creator_id is None, should we create creators?
+
+                # If there are multiple affiliation strings, use the first one.
+                affiliations = c.get('affiliation', []) or []
+                raw_affiliation = clean(affiliations[0]) if affiliations else None
+
+                name = c.get('name')
+                given_name = c.get('givenName')
+                surname = c.get('familyName')
+
+                if name:
+                    name = clean(name)
+                if not name:
+                    continue
+                if name in name_blacklist:
+                    continue
+                if name.lower() in UNKNOWN_MARKERS:
+                    continue
+                # Unpack name, if we have an index form (e.g. 'Razis, Panos A') into 'Panos A Razis'.
+                name = index_form_to_display_name(name)
+
+                if given_name:
+                    given_name = clean(given_name)
+                if surname:
+                    surname = clean(surname)
+                if raw_affiliation == '':
+                    continue
+
+                extra = None
+
+                # "DataManager", "DataCurator", "ContactPerson", "Distributor",
+                # "RegistrationAgency", "Sponsor", "Researcher",
+                # "RelatedPerson", "ProjectLeader", "Editor", "Other",
+                # "ProjectMember", "Funder", "RightsHolder", "DataCollector",
+                # "Supervisor", "Producer", "HostingInstitution", "ResearchGroup"
+                contributorType = c.get('contributorType', '') or ''
+
+                if contributorType:
+                    extra = {'type': contributorType}
+
+                contribs.append(
+                    fatcat_openapi_client.ReleaseContrib(
+                        creator_id=creator_id,
+                        index=i,
+                        raw_name=name,
+                        given_name=given_name,
+                        surname=surname,
+                        role=role,
+                        raw_affiliation=raw_affiliation,
+                        extra=extra,
+                    ))
+            elif nameType == 'Organizational':
+                name = c.get('name', '') or ''
+                if name in UNKNOWN_MARKERS:
+                    continue
+                if len(name) < 3:
+                    continue
+                extra = {'organization': name}
+                contribs.append(fatcat_openapi_client.ReleaseContrib(
+                    index=i, extra=extra))
+            else:
+                # The doi has to be passed in by the caller, it is not a local name in this method.
+                print('[{}] unknown name type: {}'.format(doi, nameType), file=sys.stderr)
+
+        return contribs
+
 
 def lookup_license_slug(raw):
     """
@@ -971,6 +992,8 @@ def index_form_to_display_name(s):
     if s.count(',') > 1:
         # "Dr. Hina, Dr. Muhammad Usman Shahid, Dr. Muhammad Zeeshan Khan"
         return s
+
+    # Not names, but sprinkled in fields where authors live.
stopwords = [s.lower() for s in ( 'Archive', 'Collection', diff --git a/python/tests/files/datacite/datacite_doc_26.json b/python/tests/files/datacite/datacite_doc_26.json new file mode 100644 index 00000000..c2abb1b2 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_26.json @@ -0,0 +1,57 @@ +{ + "attributes": { + "doi": "10.7916/d86x0cg1", + "creators": [ + { + "name": "Anton Welch", + "affiliation": [ + "Department of pataphysics" + ], + "nameIdentifiers": [] + } + ], + "contributors": [ + { + "name": "Wemmer, David", + "nameType": "Personal", + "givenName": "David", + "familyName": "Wemmer", + "affiliation": [], + "contributorType": "Editor" + } + ], + "titles": [ + { + "title": "Additional file 123: ABC" + }, + { + "title": "DEF", + "titleType": "Subtitle" + } + ], + "publicationYear": 2016, + "language": "DE-CH", + "types": { + "ris": "GEN", + "bibtex": "misc", + "citeproc": "article", + "schemaOrg": "CreativeWork" + }, + "dates": [ + { + "date": "2017-08-24", + "dateType": "Created" + }, + { + "date": "2019-08-04", + "dateType": "Updated" + }, + { + "date": "2017", + "dateType": "Issued" + } + ], + "isActive": true, + "state": "findable" + } +} diff --git a/python/tests/files/datacite/datacite_result_05.json b/python/tests/files/datacite/datacite_result_05.json index 22542a10..c4e5418d 100644 --- a/python/tests/files/datacite/datacite_result_05.json +++ b/python/tests/files/datacite/datacite_result_05.json @@ -523,6 +523,12 @@ "given_name": "Christian", "surname": "Wurzbacher", "role": "author" + }, + { + "raw_name": "Kessy Abarenkov" + }, + { + "raw_name": "NHM UT-University Of Tartu; Natural History Museum And Botanic Garden" } ], "refs": [], diff --git a/python/tests/files/datacite/datacite_result_09.json b/python/tests/files/datacite/datacite_result_09.json index fd873309..c93dc769 100644 --- a/python/tests/files/datacite/datacite_result_09.json +++ b/python/tests/files/datacite/datacite_result_09.json @@ -32,6 +32,17 @@ "given_name": "Nils", 
"surname": "Kirstaedter", "role": "author" + }, + { + "extra": { + "organization": "TIB-Technische Informationsbibliothek Universitätsbibliothek Hannover" + } + }, + { + "raw_name": "Technische Informationsbibliothek (TIB)", + "extra": { + "type": "DataManager" + } } ], "refs": [], diff --git a/python/tests/files/datacite/datacite_result_26.json b/python/tests/files/datacite/datacite_result_26.json new file mode 100644 index 00000000..8d26197c --- /dev/null +++ b/python/tests/files/datacite/datacite_result_26.json @@ -0,0 +1,31 @@ +{ + "extra": { + "datacite": {}, + "release_month": 8 + }, + "title": "Additional file 123: ABC", + "subtitle": "DEF", + "release_type": "stub", + "release_stage": "published", + "release_date": "2017-08-24", + "release_year": 2017, + "ext_ids": { + "doi": "10.7916/d86x0cg1" + }, + "contribs": [ + { + "index": 0, + "raw_name": "Anton Welch", + "role": "author", + "raw_affiliation": "Department of pataphysics" + }, + { + "extra": {"type": "Editor"}, + "raw_name": "David Wemmer", + "given_name": "David", + "surname": "Wemmer" + } + ], + "refs": [], + "abstracts": [] +} diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index 7293ecac..5ad7ef2c 100644 --- a/python/tests/import_datacite.py +++ b/python/tests/import_datacite.py @@ -275,7 +275,7 @@ def test_datacite_dict_parse(datacite_importer): assert r.extra['datacite']['subjects'] == [{'subject': 'Plant Genetic Resource for Food and Agriculture'}] assert len(r.abstracts) == 1 assert len(r.abstracts[0].content) == 421 - assert len(r.contribs) == 1 + assert len(r.contribs) == 2 assert r.contribs[0].raw_name == "GLIS Of The ITPGRFA" assert r.contribs[0].given_name == None assert r.contribs[0].surname == None @@ -287,7 +287,7 @@ def test_datacite_conversions(datacite_importer): for now. 
""" datacite_importer.debug = True - for i in range(26): + for i in range(27): src = 'tests/files/datacite/datacite_doc_{0:02d}.json'.format(i) dst = 'tests/files/datacite/datacite_result_{0:02d}.json'.format(i) print('testing mapping from {} => {}'.format(src, dst)) -- cgit v1.2.3