summaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2020-01-08 22:33:58 +0100
committerMartin Czygan <martin.czygan@gmail.com>2020-01-08 22:33:58 +0100
commit62d6a7e48d6bea1bc7f451c6043f38aee2051f9b (patch)
treec28079d6f3a12f106607fb6fc1fa9a4a5c83d9de /python/fatcat_tools
parentb7a325360ca8ae3107411e9e1966d93b999bbb52 (diff)
downloadfatcat-62d6a7e48d6bea1bc7f451c6043f38aee2051f9b.tar.gz
fatcat-62d6a7e48d6bea1bc7f451c6043f38aee2051f9b.zip
datacite: factor out contributor handling
Use values from: * attributes.creators[] * attributes.contributors[]
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r--python/fatcat_tools/importers/datacite.py183
1 files changed, 103 insertions, 80 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index fc986994..9ca72758 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -303,88 +303,11 @@ class DataciteImporter(EntityImporter):
print('[{}] skipping non-ascii doi for now'.format(doi))
return None
- # Contributors. Many nameIdentifierSchemes, we do not use (yet):
- # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme":
- # ["LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID",
- # "SCOPUS", "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID"].
- contribs = []
-
- # Names, that should be ignored right away.
- name_blacklist = set(('Occdownload Gbif.Org',))
-
- for i, c in enumerate(attributes['creators']):
- nameType = c.get('nameType', '') or ''
- if nameType in ('', 'Personal'):
- creator_id = None
- for nid in c.get('nameIdentifiers', []):
- name_scheme = nid.get('nameIdentifierScheme', '') or ''
- if not name_scheme.lower() == "orcid":
- continue
- orcid = nid.get('nameIdentifier',
- '').replace('https://orcid.org/', '')
- if not orcid:
- continue
- creator_id = self.lookup_orcid(orcid)
- # TODO(martin): If creator_id is None, should we create creators?
-
- # If there are multiple affiliation strings, use the first one.
- affiliations = c.get('affiliation', []) or []
- raw_affiliation = None
- if len(affiliations) == 0:
- raw_affiliation = None
- else:
- raw_affiliation = clean(affiliations[0])
-
- name = c.get('name')
- given_name = c.get('givenName')
- surname = c.get('familyName')
-
- if name:
- name = clean(name)
-
- if name in name_blacklist:
- continue
-
- if given_name:
- given_name = clean(given_name)
-
- if surname:
- surname = clean(surname)
-
- if not name:
- continue
-
- if raw_affiliation == '':
- continue
- if name.lower() in UNKNOWN_MARKERS:
- continue
+ creators = attributes.get('creators', []) or []
+ contributors = attributes.get('contributors', []) or [] # Much fewer than creators.
- # Unpack name, if we have an index form (e.g. 'Razis, Panos A') into 'Panos A razis'.
- if name:
- name = index_form_to_display_name(name)
-
- contribs.append(
- fatcat_openapi_client.ReleaseContrib(
- creator_id=creator_id,
- index=i,
- raw_name=name,
- given_name=given_name,
- surname=surname,
- role='author',
- raw_affiliation=raw_affiliation,
- ))
- elif nameType == 'Organizational':
- name = c.get('name', '') or ''
- if name in UNKNOWN_MARKERS:
- continue
- if len(name) < 3:
- continue
- extra = {'organization': name}
- contribs.append(fatcat_openapi_client.ReleaseContrib(
- index=i, extra=extra))
- else:
- print('[{}] unknown name type: {}'.format(doi, nameType), file=sys.stderr)
+ contribs = self.parse_datacite_creators(creators) + self.parse_datacite_creators(contributors, role=None, set_index=False)
# Title, may come with "attributes.titles[].titleType", like
# "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle"
@@ -767,6 +690,104 @@ class DataciteImporter(EntityImporter):
extra=self.editgroup_extra),
entity_list=batch))
+    def parse_datacite_creators(self, creators, role='author', set_index=True, doi=None):
+        """
+        Parses a list of creators into a list of ReleaseContrib objects. Set
+        set_index to False, if the index contrib field should be left blank.
+
+        The same routine is used for `attributes.creators[]` and
+        `attributes.contributors[]`; entries are dicts read with the keys
+        `nameType`, `name`, `givenName`, `familyName`, `affiliation`,
+        `nameIdentifiers` and `contributorType`.
+
+        Args:
+            creators: list of creator/contributor dicts (may be empty).
+            role: value stored in the `role` field of personal contribs.
+            set_index: if True, the position of the entry in `creators` is
+                stored as `index`; otherwise `index` is None.
+            doi: optional identifier, used only to prefix the message
+                printed to stderr for an unknown `nameType`.
+
+        Returns:
+            A list of fatcat_openapi_client.ReleaseContrib objects. Personal
+            entries without a usable name, with a blacklisted or "unknown"
+            name, or whose first affiliation cleans to an empty string are
+            skipped, as are organizational entries with an unknown or very
+            short (< 3 characters) name.
+        """
+        # Contributors. Many nameIdentifierSchemes, we do not use (yet):
+        # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme":
+        # ["LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID",
+        # "SCOPUS", "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID"].
+        contribs = []
+
+        # Names, that should be ignored right away.
+        name_blacklist = {'Occdownload Gbif.Org'}
+
+        for i, c in enumerate(creators):
+            if not set_index:
+                i = None
+            nameType = c.get('nameType', '') or ''
+            if nameType in ('', 'Personal'):
+                creator_id = None
+                # The key may be present with an explicit null, hence `or []`.
+                for nid in c.get('nameIdentifiers', []) or []:
+                    name_scheme = nid.get('nameIdentifierScheme', '') or ''
+                    if name_scheme.lower() != "orcid":
+                        continue
+                    # Same null guard as above, before calling str.replace.
+                    orcid = (nid.get('nameIdentifier', '') or '').replace('https://orcid.org/', '')
+                    if not orcid:
+                        continue
+                    creator_id = self.lookup_orcid(orcid)
+                    # TODO(martin): If creator_id is None, should we create creators?
+
+                # If there are multiple affiliation strings, use the first one.
+                affiliations = c.get('affiliation', []) or []
+                raw_affiliation = clean(affiliations[0]) if affiliations else None
+
+                name = c.get('name')
+                given_name = c.get('givenName')
+                surname = c.get('familyName')
+
+                if name:
+                    name = clean(name)
+                if not name:
+                    continue
+                if name in name_blacklist:
+                    continue
+                if name.lower() in UNKNOWN_MARKERS:
+                    continue
+                # Unpack name, if we have an index form (e.g. 'Razis, Panos A') into 'Panos A Razis'.
+                name = index_form_to_display_name(name)
+
+                if given_name:
+                    given_name = clean(given_name)
+                if surname:
+                    surname = clean(surname)
+                # An affiliation that was given, but cleaned to an empty
+                # string, disqualifies the whole entry.
+                if raw_affiliation == '':
+                    continue
+
+                extra = None
+
+                # "DataManager", "DataCurator", "ContactPerson", "Distributor",
+                # "RegistrationAgency", "Sponsor", "Researcher",
+                # "RelatedPerson", "ProjectLeader", "Editor", "Other",
+                # "ProjectMember", "Funder", "RightsHolder", "DataCollector",
+                # "Supervisor", "Producer", "HostingInstitution", "ResearchGroup"
+                contributorType = c.get('contributorType', '') or ''
+
+                if contributorType:
+                    extra = {'type': contributorType}
+
+                contribs.append(
+                    fatcat_openapi_client.ReleaseContrib(
+                        creator_id=creator_id,
+                        index=i,
+                        raw_name=name,
+                        given_name=given_name,
+                        surname=surname,
+                        role=role,
+                        raw_affiliation=raw_affiliation,
+                        extra=extra,
+                    ))
+            elif nameType == 'Organizational':
+                name = c.get('name', '') or ''
+                # NOTE(review): compared as-is here, but lowercased in the
+                # personal branch above -- confirm which one is intended.
+                if name in UNKNOWN_MARKERS:
+                    continue
+                if len(name) < 3:
+                    continue
+                # Organizations carry their name in `extra` only; no
+                # raw_name or role is set for them.
+                extra = {'organization': name}
+                contribs.append(fatcat_openapi_client.ReleaseContrib(
+                    index=i, extra=extra))
+            else:
+                # `doi` is passed in by the caller; it is not otherwise
+                # known inside this helper.
+                print('[{}] unknown name type: {}'.format(doi, nameType), file=sys.stderr)
+
+        return contribs
+
def lookup_license_slug(raw):
"""
@@ -971,6 +992,8 @@ def index_form_to_display_name(s):
if s.count(',') > 1:
# "Dr. Hina, Dr. Muhammad Usman Shahid, Dr. Muhammad Zeeshan Khan"
return s
+
+ # Not names, but sprinkled in fields where authors live.
stopwords = [s.lower() for s in (
'Archive',
'Collection',