about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2020-01-09 02:08:08 +0100
committer Martin Czygan <martin.czygan@gmail.com> 2020-01-09 02:08:08 +0100
commit 8eab14ac12b17f4965ec5569853bf885ca6aacd2 (patch)
tree 5f970575b52390e0f8362e7494306bc9890ee514
parent 24ac1dd2f0783583881320ef5fde6540d8530467 (diff)
download fatcat-8eab14ac12b17f4965ec5569853bf885ca6aacd2.tar.gz
fatcat-8eab14ac12b17f4965ec5569853bf885ca6aacd2.zip
datacite: add 'Unknown' to blacklist
-rw-r--r-- python/fatcat_tools/importers/datacite.py 6
-rw-r--r-- python/tests/files/datacite/datacite_result_10.json 8
2 files changed, 6 insertions, 8 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 7878ebfa..ed8b0906 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -148,8 +148,12 @@ UNKNOWN_MARKERS = set(DATACITE_UNKNOWN_MARKERS).union(set((
'NN',
'n.a.',
'[s.n.]',
+ 'Unknown',
)))
+# UNKNOWN_MARKERS_LOWER is the lowercased version of the UNKNOWN_MARKERS blacklist.
+UNKNOWN_MARKERS_LOWER = set((v.lower() for v in UNKNOWN_MARKERS))
+
# TODO(martin): merge this with other maps, maybe.
LICENSE_SLUG_MAP = {
"//creativecommons.org/licenses/by/2.0/": "CC-BY",
@@ -736,7 +740,7 @@ class DataciteImporter(EntityImporter):
continue
if name in name_blacklist:
continue
- if name.lower() in UNKNOWN_MARKERS:
+ if name.lower() in UNKNOWN_MARKERS_LOWER:
continue
# Unpack name, if we have an index form (e.g. 'Razis, Panos A'), into 'Panos A Razis'.
if name:
diff --git a/python/tests/files/datacite/datacite_result_10.json b/python/tests/files/datacite/datacite_result_10.json
index 1bb70be6..ed10fe01 100644
--- a/python/tests/files/datacite/datacite_result_10.json
+++ b/python/tests/files/datacite/datacite_result_10.json
@@ -1,12 +1,6 @@
{
"abstracts": [],
- "contribs": [
- {
- "index": 0,
- "raw_name": "Unknown",
- "role": "author"
- }
- ],
+ "contribs": [],
"ext_ids": {
"doi": "10.25549/wpacards-m6171"
},