aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2020-01-08 22:33:58 +0100
committer Martin Czygan <martin.czygan@gmail.com> 2020-01-08 22:33:58 +0100
commit 62d6a7e48d6bea1bc7f451c6043f38aee2051f9b (patch)
tree c28079d6f3a12f106607fb6fc1fa9a4a5c83d9de
parent b7a325360ca8ae3107411e9e1966d93b999bbb52 (diff)
download fatcat-62d6a7e48d6bea1bc7f451c6043f38aee2051f9b.tar.gz
fatcat-62d6a7e48d6bea1bc7f451c6043f38aee2051f9b.zip
datacite: factor out contributor handling
Use values from: * attributes.creators[] * attributes.contributors[]
-rw-r--r-- python/fatcat_tools/importers/datacite.py 183
-rw-r--r-- python/tests/files/datacite/datacite_doc_26.json 57
-rw-r--r-- python/tests/files/datacite/datacite_result_05.json 6
-rw-r--r-- python/tests/files/datacite/datacite_result_09.json 11
-rw-r--r-- python/tests/files/datacite/datacite_result_26.json 31
-rw-r--r-- python/tests/import_datacite.py 4
6 files changed, 210 insertions, 82 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index fc986994..9ca72758 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -303,88 +303,11 @@ class DataciteImporter(EntityImporter):
print('[{}] skipping non-ascii doi for now'.format(doi))
return None
- # Contributors. Many nameIdentifierSchemes, we do not use (yet):
- # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme":
- # ["LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID",
- # "SCOPUS", "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID"].
- contribs = []
-
- # Names, that should be ignored right away.
- name_blacklist = set(('Occdownload Gbif.Org',))
-
- for i, c in enumerate(attributes['creators']):
- nameType = c.get('nameType', '') or ''
- if nameType in ('', 'Personal'):
- creator_id = None
- for nid in c.get('nameIdentifiers', []):
- name_scheme = nid.get('nameIdentifierScheme', '') or ''
- if not name_scheme.lower() == "orcid":
- continue
- orcid = nid.get('nameIdentifier',
- '').replace('https://orcid.org/', '')
- if not orcid:
- continue
- creator_id = self.lookup_orcid(orcid)
- # TODO(martin): If creator_id is None, should we create creators?
-
- # If there are multiple affiliation strings, use the first one.
- affiliations = c.get('affiliation', []) or []
- raw_affiliation = None
- if len(affiliations) == 0:
- raw_affiliation = None
- else:
- raw_affiliation = clean(affiliations[0])
-
- name = c.get('name')
- given_name = c.get('givenName')
- surname = c.get('familyName')
-
- if name:
- name = clean(name)
-
- if name in name_blacklist:
- continue
-
- if given_name:
- given_name = clean(given_name)
-
- if surname:
- surname = clean(surname)
-
- if not name:
- continue
-
- if raw_affiliation == '':
- continue
- if name.lower() in UNKNOWN_MARKERS:
- continue
+ creators = attributes.get('creators', []) or []
+ contributors = attributes.get('contributors', []) or [] # Much fewer than creators.
- # Unpack name, if we have an index form (e.g. 'Razis, Panos A') into 'Panos A razis'.
- if name:
- name = index_form_to_display_name(name)
-
- contribs.append(
- fatcat_openapi_client.ReleaseContrib(
- creator_id=creator_id,
- index=i,
- raw_name=name,
- given_name=given_name,
- surname=surname,
- role='author',
- raw_affiliation=raw_affiliation,
- ))
- elif nameType == 'Organizational':
- name = c.get('name', '') or ''
- if name in UNKNOWN_MARKERS:
- continue
- if len(name) < 3:
- continue
- extra = {'organization': name}
- contribs.append(fatcat_openapi_client.ReleaseContrib(
- index=i, extra=extra))
- else:
- print('[{}] unknown name type: {}'.format(doi, nameType), file=sys.stderr)
+ contribs = self.parse_datacite_creators(creators) + self.parse_datacite_creators(contributors, role=None, set_index=False)
# Title, may come with "attributes.titles[].titleType", like
# "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle"
@@ -767,6 +690,104 @@ class DataciteImporter(EntityImporter):
extra=self.editgroup_extra),
entity_list=batch))
+    def parse_datacite_creators(self, creators, role='author', set_index=True, doi=None):
+        """
+        Parses a list of creators (or contributors) into a list of
+        ReleaseContrib objects; a missing (None) list yields an empty list.
+        Set set_index to False, if the index contrib field should be left
+        blank. The optional doi is only used to tag the "unknown name type"
+        warning printed to stderr.
+        """
+        # Many nameIdentifierSchemes, we do not use (yet):
+        # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme":
+        # ["LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID",
+        # "SCOPUS", "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID"].
+        contribs = []
+
+        # Names, that should be ignored right away.
+        name_blacklist = {'Occdownload Gbif.Org'}
+
+        for i, c in enumerate(creators or []):
+            index = i if set_index else None
+            nameType = c.get('nameType', '') or ''
+            if nameType in ('', 'Personal'):
+                creator_id = None
+                # Values may be JSON null, hence the "or" fallbacks below.
+                for nid in c.get('nameIdentifiers', []) or []:
+                    name_scheme = nid.get('nameIdentifierScheme', '') or ''
+                    if name_scheme.lower() != "orcid":
+                        continue
+                    orcid = (nid.get('nameIdentifier', '') or '').replace('https://orcid.org/', '')
+                    if not orcid:
+                        continue
+                    # With several ORCID identifiers, the last lookup wins.
+                    creator_id = self.lookup_orcid(orcid)
+                    # TODO(martin): If creator_id is None, should we create creators?
+
+                # If there are multiple affiliation strings, use the first one.
+                affiliations = c.get('affiliation', []) or []
+                raw_affiliation = clean(affiliations[0]) if affiliations else None
+
+                name = c.get('name')
+                given_name = c.get('givenName')
+                surname = c.get('familyName')
+
+                if name:
+                    name = clean(name)
+                if not name:
+                    continue
+                if name in name_blacklist:
+                    continue
+                if name.lower() in UNKNOWN_MARKERS:
+                    continue
+                # Unpack name, if we have an index form (e.g. 'Wemmer, David')
+                # into a display form ('David Wemmer').
+                name = index_form_to_display_name(name)
+
+                if given_name:
+                    given_name = clean(given_name)
+                if surname:
+                    surname = clean(surname)
+                # NOTE(review): an affiliation for which clean() returns '' skips
+                # the whole contrib (behavior kept as is) -- confirm this is intended.
+                if raw_affiliation == '':
+                    continue
+
+                # "DataManager", "DataCurator", "ContactPerson", "Distributor",
+                # "RegistrationAgency", "Sponsor", "Researcher",
+                # "RelatedPerson", "ProjectLeader", "Editor", "Other",
+                # "ProjectMember", "Funder", "RightsHolder", "DataCollector",
+                # "Supervisor", "Producer", "HostingInstitution", "ResearchGroup"
+                contributorType = c.get('contributorType', '') or ''
+                extra = {'type': contributorType} if contributorType else None
+
+                contribs.append(
+                    fatcat_openapi_client.ReleaseContrib(
+                        creator_id=creator_id,
+                        index=index,
+                        raw_name=name,
+                        given_name=given_name,
+                        surname=surname,
+                        role=role,
+                        raw_affiliation=raw_affiliation,
+                        extra=extra,
+                    ))
+            elif nameType == 'Organizational':
+                # Organizations carry neither role nor raw_name, only extra.
+                name = c.get('name', '') or ''
+                if name in UNKNOWN_MARKERS:
+                    continue
+                if len(name) < 3:
+                    continue
+                extra = {'organization': name}
+                contribs.append(fatcat_openapi_client.ReleaseContrib(
+                    index=index, extra=extra))
+            else:
+                # doi is None unless the caller passed it in.
+                print('[{}] unknown name type: {}'.format(doi, nameType), file=sys.stderr)
+
+        return contribs
+
def lookup_license_slug(raw):
"""
@@ -971,6 +992,8 @@ def index_form_to_display_name(s):
if s.count(',') > 1:
# "Dr. Hina, Dr. Muhammad Usman Shahid, Dr. Muhammad Zeeshan Khan"
return s
+
+ # Not names, but sprinkled in fields where authors live.
stopwords = [s.lower() for s in (
'Archive',
'Collection',
diff --git a/python/tests/files/datacite/datacite_doc_26.json b/python/tests/files/datacite/datacite_doc_26.json
new file mode 100644
index 00000000..c2abb1b2
--- /dev/null
+++ b/python/tests/files/datacite/datacite_doc_26.json
@@ -0,0 +1,57 @@
+{
+ "attributes": {
+ "doi": "10.7916/d86x0cg1",
+ "creators": [
+ {
+ "name": "Anton Welch",
+ "affiliation": [
+ "Department of pataphysics"
+ ],
+ "nameIdentifiers": []
+ }
+ ],
+ "contributors": [
+ {
+ "name": "Wemmer, David",
+ "nameType": "Personal",
+ "givenName": "David",
+ "familyName": "Wemmer",
+ "affiliation": [],
+ "contributorType": "Editor"
+ }
+ ],
+ "titles": [
+ {
+ "title": "Additional file 123: ABC"
+ },
+ {
+ "title": "DEF",
+ "titleType": "Subtitle"
+ }
+ ],
+ "publicationYear": 2016,
+ "language": "DE-CH",
+ "types": {
+ "ris": "GEN",
+ "bibtex": "misc",
+ "citeproc": "article",
+ "schemaOrg": "CreativeWork"
+ },
+ "dates": [
+ {
+ "date": "2017-08-24",
+ "dateType": "Created"
+ },
+ {
+ "date": "2019-08-04",
+ "dateType": "Updated"
+ },
+ {
+ "date": "2017",
+ "dateType": "Issued"
+ }
+ ],
+ "isActive": true,
+ "state": "findable"
+ }
+}
diff --git a/python/tests/files/datacite/datacite_result_05.json b/python/tests/files/datacite/datacite_result_05.json
index 22542a10..c4e5418d 100644
--- a/python/tests/files/datacite/datacite_result_05.json
+++ b/python/tests/files/datacite/datacite_result_05.json
@@ -523,6 +523,12 @@
"given_name": "Christian",
"surname": "Wurzbacher",
"role": "author"
+ },
+ {
+ "raw_name": "Kessy Abarenkov"
+ },
+ {
+ "raw_name": "NHM UT-University Of Tartu; Natural History Museum And Botanic Garden"
}
],
"refs": [],
diff --git a/python/tests/files/datacite/datacite_result_09.json b/python/tests/files/datacite/datacite_result_09.json
index fd873309..c93dc769 100644
--- a/python/tests/files/datacite/datacite_result_09.json
+++ b/python/tests/files/datacite/datacite_result_09.json
@@ -32,6 +32,17 @@
"given_name": "Nils",
"surname": "Kirstaedter",
"role": "author"
+ },
+ {
+ "extra": {
+ "organization": "TIB-Technische Informationsbibliothek Universitätsbibliothek Hannover"
+ }
+ },
+ {
+ "raw_name": "Technische Informationsbibliothek (TIB)",
+ "extra": {
+ "type": "DataManager"
+ }
}
],
"refs": [],
diff --git a/python/tests/files/datacite/datacite_result_26.json b/python/tests/files/datacite/datacite_result_26.json
new file mode 100644
index 00000000..8d26197c
--- /dev/null
+++ b/python/tests/files/datacite/datacite_result_26.json
@@ -0,0 +1,31 @@
+{
+ "extra": {
+ "datacite": {},
+ "release_month": 8
+ },
+ "title": "Additional file 123: ABC",
+ "subtitle": "DEF",
+ "release_type": "stub",
+ "release_stage": "published",
+ "release_date": "2017-08-24",
+ "release_year": 2017,
+ "ext_ids": {
+ "doi": "10.7916/d86x0cg1"
+ },
+ "contribs": [
+ {
+ "index": 0,
+ "raw_name": "Anton Welch",
+ "role": "author",
+ "raw_affiliation": "Department of pataphysics"
+ },
+ {
+ "extra": {"type": "Editor"},
+ "raw_name": "David Wemmer",
+ "given_name": "David",
+ "surname": "Wemmer"
+ }
+ ],
+ "refs": [],
+ "abstracts": []
+}
diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py
index 7293ecac..5ad7ef2c 100644
--- a/python/tests/import_datacite.py
+++ b/python/tests/import_datacite.py
@@ -275,7 +275,7 @@ def test_datacite_dict_parse(datacite_importer):
assert r.extra['datacite']['subjects'] == [{'subject': 'Plant Genetic Resource for Food and Agriculture'}]
assert len(r.abstracts) == 1
assert len(r.abstracts[0].content) == 421
- assert len(r.contribs) == 1
+ assert len(r.contribs) == 2
assert r.contribs[0].raw_name == "GLIS Of The ITPGRFA"
assert r.contribs[0].given_name == None
assert r.contribs[0].surname == None
@@ -287,7 +287,7 @@ def test_datacite_conversions(datacite_importer):
for now.
"""
datacite_importer.debug = True
- for i in range(26):
+ for i in range(27):
src = 'tests/files/datacite/datacite_doc_{0:02d}.json'.format(i)
dst = 'tests/files/datacite/datacite_result_{0:02d}.json'.format(i)
print('testing mapping from {} => {}'.format(src, dst))