summaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers/datacite.py
diff options
context:
space:
mode:
authorMartin Czygan <martin@archive.org>2020-09-22 23:28:48 +0000
committerMartin Czygan <martin@archive.org>2020-09-22 23:28:48 +0000
commitbca7d905b377e5e8cd04666a29eef2dd391dfa15 (patch)
treead89a3c8f56620d341cc35aecbca44f051443d0d /python/fatcat_tools/importers/datacite.py
parent5c879f7f147b11d1013772b2f9a74442984f58fc (diff)
parent6589bffcde0ece37f4976553407ee849af7f28b5 (diff)
downloadfatcat-bca7d905b377e5e8cd04666a29eef2dd391dfa15.tar.gz
fatcat-bca7d905b377e5e8cd04666a29eef2dd391dfa15.zip
Merge branch 'martin-datacite-spammy-title' into 'master'
address spammy datacite titles See merge request webgroup/fatcat!85
Diffstat (limited to 'python/fatcat_tools/importers/datacite.py')
-rw-r--r--python/fatcat_tools/importers/datacite.py19
1 files changed, 19 insertions, 0 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index d4ff6784..86740e80 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -154,6 +154,15 @@ UNKNOWN_MARKERS = set(DATACITE_UNKNOWN_MARKERS).union(set((
# UNKNOWN_MARKERS_LOWER are lowercase version of UNKNOWN blacklist.
UNKNOWN_MARKERS_LOWER = set((v.lower() for v in UNKNOWN_MARKERS))
+# A title containing at least "min" of the "tokens" in a group is treated as "spam", e.g. https://fatcat.wiki/release/rzcpjwukobd4pj36ipla22cnoi
+DATACITE_TITLE_SPAM_WORDGROUPS = [
+ {
+ "tokens": ('full', 'movies', 'movie', 'watch', 'streaming', 'online',
+ 'free', 'hd', 'download', 'english', 'subtitle', 'bluray'),
+ "min": 4,
+ }
+]
+
# TODO(martin): merge this with other maps and lookup functions, eventually.
LICENSE_SLUG_MAP = {
"//archaeologydataservice.ac.uk/advice/termsofuseandaccess.xhtml/": "ADS-UK",
@@ -337,6 +346,16 @@ class DataciteImporter(EntityImporter):
print('[{}] skipping record w/o title: {}'.format(doi, obj), file=sys.stderr)
return False
+ # check for blacklisted "spam", e.g. "FULL MOVIE"
+ for rule in DATACITE_TITLE_SPAM_WORDGROUPS:
+ seen = set()
+ for token in rule.get("tokens", []):
+ if token in title.lower():
+ seen.add(token)
+ if len(seen) >= rule.get("min"):
+ print("[{}] skipping spammy title: {}".format(doi, obj), file=sys.stderr)
+ return False
+
if not subtitle:
subtitle = None
else: