summaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers
diff options
context:
space:
mode:
authorbnewbold <bnewbold@archive.org>2020-07-11 00:31:47 +0000
committerbnewbold <bnewbold@archive.org>2020-07-11 00:31:47 +0000
commitf5aefab6a6431ab9db99761457fd47b36b920b8c (patch)
treed144988d310aeecf8521cfc33aca9f0667dfedbc /python/fatcat_tools/importers
parent26b455ffad566bef58684a78654a2719c409588a (diff)
parent3c266e07771271241aa8cff3e3199a45109362af (diff)
downloadfatcat-f5aefab6a6431ab9db99761457fd47b36b920b8c.tar.gz
fatcat-f5aefab6a6431ab9db99761457fd47b36b920b8c.zip
Merge branch 'martin-datacite-duplicated-author-gh-59' into 'master'
datacite: address duplicated contributor issue See merge request webgroup/fatcat!65
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r--python/fatcat_tools/importers/datacite.py66
1 files changed, 60 insertions, 6 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 785107ee..ebb29feb 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -294,7 +294,39 @@ class DataciteImporter(EntityImporter):
creators = attributes.get('creators', []) or []
contributors = attributes.get('contributors', []) or [] # Much fewer than creators.
- contribs = self.parse_datacite_creators(creators, doi=doi) + self.parse_datacite_creators(contributors, role=None, set_index=False, doi=doi)
+ contribs = self.parse_datacite_creators(creators, doi=doi)
+
+ # Beside creators, we have contributors in datacite. Sample:
+ # ContactPerson, DataCollector, DataCurator, DataManager, Distributor,
+ # Editor, Funder, HostingInstitution, Other, Producer, ProjectLeader,
+ # ProjectMember, RelatedPerson, ResearchGroup, Researcher,
+ # RightsHolder, Sponsor, Supervisor
+ #
+ # Datacite schema:
+ # https://schema.datacite.org/meta/kernel-4.3/doc/DataCite-MetadataKernel_v4.3.pdf#page=32
+ # -- could be used as a form of controlled vocab?
+ #
+ # Currently (07/2020) in release_contrib:
+ #
+ # select count(*), role from release_contrib group by role;
+ # count | role
+ # -----------+------------
+ # 500269665 | author
+ # 4386563 | editor
+ # 17871 | translator
+ # 10870584 |
+ # (4 rows)
+ #
+ # Related: https://guide.fatcat.wiki/entity_release.html -- role
+ # (string, of a set): the type of contribution, from a controlled
+ # vocabulary. TODO: vocabulary needs review.
+ contribs_extra_contributors = self.parse_datacite_creators(contributors, set_index=False, doi=doi)
+
+ # Unfortunately, creators and contributors might overlap, refs GH59.
+ for cc in contribs_extra_contributors:
+ if contributor_list_contains_contributor(contribs, cc):
+ continue
+ contribs.append(cc)
# Title, may come with "attributes.titles[].titleType", like
# "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle"
@@ -725,9 +757,10 @@ class DataciteImporter(EntityImporter):
# Names, that should be ignored right away.
name_blacklist = set(('Occdownload Gbif.Org',))
- for i, c in enumerate(creators):
+ i = 0
+ for c in creators:
if not set_index:
- i = None
+ i = None
nameType = c.get('nameType', '') or ''
if nameType in ('', 'Personal'):
creator_id = None
@@ -799,8 +832,7 @@ class DataciteImporter(EntityImporter):
if contributorType:
extra = {'type': contributorType}
- contribs.append(
- fatcat_openapi_client.ReleaseContrib(
+ rc = fatcat_openapi_client.ReleaseContrib(
creator_id=creator_id,
index=i,
raw_name=name,
@@ -809,7 +841,12 @@ class DataciteImporter(EntityImporter):
role=role,
raw_affiliation=raw_affiliation,
extra=extra,
- ))
+ )
+ # Filter out duplicates early.
+ if not contributor_list_contains_contributor(contribs, rc):
+ contribs.append(rc)
+ if i is not None:
+ i += 1
elif nameType == 'Organizational':
name = c.get('name', '') or ''
if name in UNKNOWN_MARKERS:
@@ -819,12 +856,29 @@ class DataciteImporter(EntityImporter):
extra = {'organization': name}
contribs.append(fatcat_openapi_client.ReleaseContrib(
index=i, extra=extra))
+ if i is not None:
+ i += 1
else:
print('[{}] unknown name type: {}'.format(doi, nameType), file=sys.stderr)
return contribs
+def contributor_list_contains_contributor(contributor_list, contributor):
+ """
+ Given a list of contributors, determine, whether contrib is in that list.
+ """
+ for cc in contributor_list:
+ if cc.raw_name != contributor.raw_name:
+ continue
+ cc_role = cc.role or 'author'
+ contributor_role = contributor.role or 'author'
+ if cc_role != contributor_role:
+ continue
+ return True
+ return False
+
+
def lookup_license_slug(raw):
"""
Resolve a variety of strings into a some pseudo-canonical form, e.g.