diff options
author | bnewbold <bnewbold@archive.org> | 2020-07-11 00:31:47 +0000 |
---|---|---|
committer | bnewbold <bnewbold@archive.org> | 2020-07-11 00:31:47 +0000 |
commit | f5aefab6a6431ab9db99761457fd47b36b920b8c (patch) | |
tree | d144988d310aeecf8521cfc33aca9f0667dfedbc /python/fatcat_tools/importers/datacite.py | |
parent | 26b455ffad566bef58684a78654a2719c409588a (diff) | |
parent | 3c266e07771271241aa8cff3e3199a45109362af (diff) | |
download | fatcat-f5aefab6a6431ab9db99761457fd47b36b920b8c.tar.gz fatcat-f5aefab6a6431ab9db99761457fd47b36b920b8c.zip |
Merge branch 'martin-datacite-duplicated-author-gh-59' into 'master'
datacite: address duplicated contributor issue
See merge request webgroup/fatcat!65
Diffstat (limited to 'python/fatcat_tools/importers/datacite.py')
-rw-r--r-- | python/fatcat_tools/importers/datacite.py | 66 |
1 files changed, 60 insertions, 6 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 785107ee..ebb29feb 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -294,7 +294,39 @@ class DataciteImporter(EntityImporter): creators = attributes.get('creators', []) or [] contributors = attributes.get('contributors', []) or [] # Much fewer than creators. - contribs = self.parse_datacite_creators(creators, doi=doi) + self.parse_datacite_creators(contributors, role=None, set_index=False, doi=doi) + contribs = self.parse_datacite_creators(creators, doi=doi) + + # Beside creators, we have contributors in datacite. Sample: + # ContactPerson, DataCollector, DataCurator, DataManager, Distributor, + # Editor, Funder, HostingInstitution, Other, Producer, ProjectLeader, + # ProjectMember, RelatedPerson, ResearchGroup, Researcher, + # RightsHolder, Sponsor, Supervisor + # + # Datacite schema: + # https://schema.datacite.org/meta/kernel-4.3/doc/DataCite-MetadataKernel_v4.3.pdf#page=32 + # -- could be used as a form of controlled vocab? + # + # Currently (07/2020) in release_contrib: + # + # select count(*), role from release_contrib group by role; + # count | role + # -----------+------------ + # 500269665 | author + # 4386563 | editor + # 17871 | translator + # 10870584 | + # (4 rows) + # + # Related: https://guide.fatcat.wiki/entity_release.html -- role + # (string, of a set): the type of contribution, from a controlled + # vocabulary. TODO: vocabulary needs review. + contribs_extra_contributors = self.parse_datacite_creators(contributors, set_index=False, doi=doi) + + # Unfortunately, creators and contributors might overlap, refs GH59. + for cc in contribs_extra_contributors: + if contributor_list_contains_contributor(contribs, cc): + continue + contribs.append(cc) # Title, may come with "attributes.titles[].titleType", like # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle" @@ -725,9 +757,10 @@ class DataciteImporter(EntityImporter): # Names, that should be ignored right away. name_blacklist = set(('Occdownload Gbif.Org',)) - for i, c in enumerate(creators): + i = 0 + for c in creators: if not set_index: - i = None + i = None nameType = c.get('nameType', '') or '' if nameType in ('', 'Personal'): creator_id = None @@ -799,8 +832,7 @@ class DataciteImporter(EntityImporter): if contributorType: extra = {'type': contributorType} - contribs.append( - fatcat_openapi_client.ReleaseContrib( + rc = fatcat_openapi_client.ReleaseContrib( creator_id=creator_id, index=i, raw_name=name, @@ -809,7 +841,12 @@ class DataciteImporter(EntityImporter): role=role, raw_affiliation=raw_affiliation, extra=extra, - )) + ) + # Filter out duplicates early. + if not contributor_list_contains_contributor(contribs, rc): + contribs.append(rc) + if i is not None: + i += 1 elif nameType == 'Organizational': name = c.get('name', '') or '' if name in UNKNOWN_MARKERS: @@ -819,12 +856,29 @@ class DataciteImporter(EntityImporter): extra = {'organization': name} contribs.append(fatcat_openapi_client.ReleaseContrib( index=i, extra=extra)) + if i is not None: + i += 1 else: print('[{}] unknown name type: {}'.format(doi, nameType), file=sys.stderr) return contribs +def contributor_list_contains_contributor(contributor_list, contributor): + """ + Given a list of contributors, determine, whether contrib is in that list. + """ + for cc in contributor_list: + if cc.raw_name != contributor.raw_name: + continue + cc_role = cc.role or 'author' + contributor_role = contributor.role or 'author' + if cc_role != contributor_role: + continue + return True + return False + + def lookup_license_slug(raw): """ Resolve a variety of strings into a some pseudo-canonical form, e.g. |