From fcc6f24a95a7b77bda4ec813daecc2b737a82412 Mon Sep 17 00:00:00 2001
From: Martin Czygan
Date: Tue, 7 Jul 2020 02:08:26 +0200
Subject: datacite: address duplicated contributor issue

Use string comparison.

* https://fatcat.wiki/release/spjysmrnsrgyzgq6ise5o44rlu/contribs
* https://api.datacite.org/dois/10.25940/roper-31098406
---
 python/fatcat_tools/importers/datacite.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'python/fatcat_tools')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 434a2941..66ec2023 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -298,6 +298,9 @@ class DataciteImporter(EntityImporter):
 
         contribs = self.parse_datacite_creators(creators, doi=doi) + self.parse_datacite_creators(contributors, role=None, set_index=False, doi=doi)
 
+        # Address duplicated author names; use raw_name string comparison; refs #59.
+        contribs = unique_contributors(contribs)
+
         # Title, may come with "attributes.titles[].titleType", like
         # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle"
         titles = attributes.get('titles', []) or []
@@ -823,6 +826,19 @@ class DataciteImporter(EntityImporter):
 
         return contribs
 
+def unique_contributors(contribs):
+    """
+    Given a list of ReleaseContrib items, return a list of unique
+    ReleaseContribs, refs GH #59.
+    """
+    unique_names, unique_contribs = set(), []
+    for rc in contribs:
+        if rc.raw_name and rc.raw_name in unique_names:
+            continue
+        unique_names.add(rc.raw_name)
+        unique_contribs.append(rc)
+    return unique_contribs
+
 def lookup_license_slug(raw):
     """
     Resolve a variety of strings into a some pseudo-canonical form, e.g.
-- 
cgit v1.2.3