summary | refs | log | tree | commit | diff | stats
path: root/python
diff options
context:
space:
mode:
author: Martin Czygan <martin.czygan@gmail.com> 2020-07-07 02:08:26 +0200
committer: Martin Czygan <martin.czygan@gmail.com> 2020-07-07 02:08:26 +0200
commit: fcc6f24a95a7b77bda4ec813daecc2b737a82412 (patch)
tree: 23795219ad991387d30a1c72c8b79e5993e254a9 /python
parent: ca8fa64c1590a43b1e92fd8898275625d083451a (diff)
download: fatcat-fcc6f24a95a7b77bda4ec813daecc2b737a82412.tar.gz
          fatcat-fcc6f24a95a7b77bda4ec813daecc2b737a82412.zip
datacite: address duplicated contributor issue
Use string comparison.
* https://fatcat.wiki/release/spjysmrnsrgyzgq6ise5o44rlu/contribs
* https://api.datacite.org/dois/10.25940/roper-31098406
Diffstat (limited to 'python')
-rw-r--r-- python/fatcat_tools/importers/datacite.py | 16
-rw-r--r-- python/tests/files/datacite/datacite_doc_33.json | 62
-rw-r--r-- python/tests/files/datacite/datacite_result_05.json | 3
-rw-r--r-- python/tests/files/datacite/datacite_result_08.json | 7
-rw-r--r-- python/tests/files/datacite/datacite_result_33.json | 31
-rw-r--r-- python/tests/import_datacite.py | 2
6 files changed, 110 insertions, 11 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 434a2941..66ec2023 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -298,6 +298,9 @@ class DataciteImporter(EntityImporter):
contribs = self.parse_datacite_creators(creators, doi=doi) + self.parse_datacite_creators(contributors, role=None, set_index=False, doi=doi)
+ # Address duplicated author names; use raw_name string comparison; refs #59.
+ contribs = unique_contributors(contribs)
+
# Title, may come with "attributes.titles[].titleType", like
# "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle"
titles = attributes.get('titles', []) or []
@@ -823,6 +826,19 @@ class DataciteImporter(EntityImporter):
return contribs
+def unique_contributors(contribs):
+ """
+ Given a list of ReleaseContrib items, return a list of unique
+ ReleaseContribs, refs GH #59.
+ """
+ unique_names, unique_contribs = set(), []
+ for rc in contribs:
+ if rc.raw_name and rc.raw_name in unique_names:
+ continue
+ unique_names.add(rc.raw_name)
+ unique_contribs.append(rc)
+ return unique_contribs
+
def lookup_license_slug(raw):
"""
Resolve a variety of strings into a some pseudo-canonical form, e.g.
diff --git a/python/tests/files/datacite/datacite_doc_33.json b/python/tests/files/datacite/datacite_doc_33.json
new file mode 100644
index 00000000..571d1220
--- /dev/null
+++ b/python/tests/files/datacite/datacite_doc_33.json
@@ -0,0 +1,62 @@
+{
+ "id": "10.17912/micropub.biology.000143",
+ "type": "dois",
+ "attributes": {
+ "doi": "10.17912/micropub.biology.000143",
+ "identifiers": null,
+ "creators": [
+ {
+ "name": "ABC News",
+ "givenName": "",
+ "familyName": "",
+ "affiliation": [],
+ "role": "author"
+ }
+ ],
+ "titles": [
+ {
+ "title": "Sample"
+ }
+ ],
+ "publisher": "microPublication Biology",
+ "publicationYear": 2019,
+ "types": {
+ "resourceTypeGeneral": "DataPaper"
+ },
+ "relatedIdentifiers": [],
+ "sizes": [],
+ "formats": [],
+ "version": null,
+ "rightsList": [],
+ "descriptions": [
+ {
+ "description": 1234567890,
+ "descriptionType": "Abstract"
+ }
+ ],
+ "geoLocations": [],
+ "fundingReferences": [],
+ "url": "https://www.micropublication.org/journals/biology/micropub.biology.000143",
+ "created": "2019-08-19T14:43:08.000Z",
+ "registered": "2019-08-19T14:43:09.000Z",
+ "published": "2019",
+ "updated": "2019-11-09T12:32:02.000Z",
+ "contributors": [
+ {
+ "name": "ABC News",
+ "givenName": "",
+ "familyName": "",
+ "affiliation": [],
+ "role": ""
+ }
+ ]
+ },
+ "relationships": {
+ "client": {
+ "data": {
+ "id": "caltech.micropub",
+ "type": "clients"
+ }
+ }
+ }
+}
diff --git a/python/tests/files/datacite/datacite_result_05.json b/python/tests/files/datacite/datacite_result_05.json
index 79c2a8fb..d634490d 100644
--- a/python/tests/files/datacite/datacite_result_05.json
+++ b/python/tests/files/datacite/datacite_result_05.json
@@ -505,9 +505,6 @@
"surname": "Wurzbacher"
},
{
- "raw_name": "Kessy Abarenkov"
- },
- {
"raw_name": "NHM UT-University Of Tartu; Natural History Museum And Botanic Garden"
}
],
diff --git a/python/tests/files/datacite/datacite_result_08.json b/python/tests/files/datacite/datacite_result_08.json
index 70237280..5a46ef50 100644
--- a/python/tests/files/datacite/datacite_result_08.json
+++ b/python/tests/files/datacite/datacite_result_08.json
@@ -13,13 +13,6 @@
"raw_name": "Kei Kajisa",
"role": "author",
"surname": "Kajisa"
- },
- {
- "given_name": "Kei",
- "index": 1,
- "raw_name": "Kei Kajisa",
- "role": "author",
- "surname": "Kajisa"
}
],
"ext_ids": {
diff --git a/python/tests/files/datacite/datacite_result_33.json b/python/tests/files/datacite/datacite_result_33.json
new file mode 100644
index 00000000..bcb72469
--- /dev/null
+++ b/python/tests/files/datacite/datacite_result_33.json
@@ -0,0 +1,31 @@
+{
+ "abstracts": [
+ {
+ "content": "1234567890",
+ "mimetype": "text/plain"
+ }
+ ],
+ "contribs": [
+ {
+ "given_name": "",
+ "surname": "",
+ "index": 0,
+ "raw_name": "ABC News",
+ "role": "author"
+ }
+ ],
+ "ext_ids": {
+ "doi": "10.17912/micropub.biology.000143"
+ },
+ "extra": {
+ "datacite": {
+ "resourceTypeGeneral": "DataPaper"
+ },
+ "container_name": "microPublication Biology"
+ },
+ "refs": [],
+ "release_stage": "published",
+ "release_year": 2019,
+ "publisher": "microPublication Biology",
+ "title": "Sample"
+}
diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py
index 20c1eaf8..1472b8ea 100644
--- a/python/tests/import_datacite.py
+++ b/python/tests/import_datacite.py
@@ -288,7 +288,7 @@ def test_datacite_conversions(datacite_importer):
for now.
"""
datacite_importer.debug = True
- for i in range(33):
+ for i in range(34):
src = 'tests/files/datacite/datacite_doc_{0:02d}.json'.format(i)
dst = 'tests/files/datacite/datacite_result_{0:02d}.json'.format(i)
with open(src, 'r') as f: