aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers/datacite.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2021-11-03 14:01:33 -0700
committer Bryan Newbold <bnewbold@robocracy.org> 2021-11-03 16:46:07 -0700
commit 36cedfde374a2643396b070d3116e4b568500e14 (patch)
tree b199868b325897ea5dc2b065192a7eba2daf9c6b /python/fatcat_tools/importers/datacite.py
parent 5d29d1336afc90d3575a0379a9e9d9bdac8d1856 (diff)
download fatcat-36cedfde374a2643396b070d3116e4b568500e14.tar.gz
fatcat-36cedfde374a2643396b070d3116e4b568500e14.zip
more involved type wrangling and fixes for importers
Diffstat (limited to 'python/fatcat_tools/importers/datacite.py')
-rw-r--r-- python/fatcat_tools/importers/datacite.py 5
1 files changed, 3 insertions, 2 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 997f8dc8..6eed8991 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -376,10 +376,11 @@ class DataciteImporter(EntityImporter):
# check for blocklisted "spam", e.g. "FULL MOVIE"
for rule in DATACITE_TITLE_SPAM_WORDGROUPS:
seen = set()
- for token in rule.get("tokens", []):
+ token_list: List[str] = rule.get("tokens") or []
+ for token in token_list:
if token in title.lower():
seen.add(token)
- if len(seen) >= rule.get("min"):
+ if len(seen) >= rule["min"]:
print("[{}] skipping spammy title: {}".format(doi, obj), file=sys.stderr)
return False