diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-03 14:01:33 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-03 16:46:07 -0700 |
commit | 36cedfde374a2643396b070d3116e4b568500e14 (patch) | |
tree | b199868b325897ea5dc2b065192a7eba2daf9c6b /python/fatcat_tools/importers/datacite.py | |
parent | 5d29d1336afc90d3575a0379a9e9d9bdac8d1856 (diff) | |
download | fatcat-36cedfde374a2643396b070d3116e4b568500e14.tar.gz fatcat-36cedfde374a2643396b070d3116e4b568500e14.zip |
more involved type wrangling and fixes for importers
Diffstat (limited to 'python/fatcat_tools/importers/datacite.py')
-rw-r--r-- | python/fatcat_tools/importers/datacite.py | 5 |
1 files changed, 3 insertions, 2 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 997f8dc8..6eed8991 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -376,10 +376,11 @@ class DataciteImporter(EntityImporter):
         # check for blocklisted "spam", e.g. "FULL MOVIE"
         for rule in DATACITE_TITLE_SPAM_WORDGROUPS:
             seen = set()
-            for token in rule.get("tokens", []):
+            token_list: List[str] = rule.get("tokens") or []
+            for token in token_list:
                 if token in title.lower():
                     seen.add(token)
-            if len(seen) >= rule.get("min"):
+            if len(seen) >= rule["min"]:
                 print("[{}] skipping spammy title: {}".format(doi, obj), file=sys.stderr)
                 return False