address spammy datacite titles

Seemingly from Zenodo, e.g.: * https://fatcat.wiki/release/rzcpjwukobd4pj36ipla22cnoi * https://doi.org/10.5281/zenodo.4041777 Currently, about 3400 records have "FULL MOVIE" in the title.
author: Martin Czygan <martin.czygan@gmail.com> 2020-09-23 00:45:38 +0200
committer: Martin Czygan <martin.czygan@gmail.com> 2020-09-23 00:45:38 +0200
commit: 6589bffcde0ece37f4976553407ee849af7f28b5 (patch)
tree: ad89a3c8f56620d341cc35aecbca44f051443d0d /python
parent: 5c879f7f147b11d1013772b2f9a74442984f58fc (diff)
download: fatcat-6589bffcde0ece37f4976553407ee849af7f28b5.tar.gz
fatcat-6589bffcde0ece37f4976553407ee849af7f28b5.zip
2 files changed, 25 insertions, 0 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index d4ff6784..86740e80 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -154,6 +154,15 @@ UNKNOWN_MARKERS = set(DATACITE_UNKNOWN_MARKERS).union(set((
 # UNKNOWN_MARKERS_LOWER are lowercase version of UNKNOWN blacklist.
 UNKNOWN_MARKERS_LOWER = set((v.lower() for v in UNKNOWN_MARKERS))
 
+# A title containing at least "min" of a group's "tokens" is treated as "spam", e.g. https://fatcat.wiki/release/rzcpjwukobd4pj36ipla22cnoi
+DATACITE_TITLE_SPAM_WORDGROUPS = [
+    {
+        "tokens": ('full', 'movies', 'movie', 'watch', 'streaming', 'online',
+                   'free', 'hd', 'download', 'english', 'subtitle', 'bluray'),
+        "min": 4,
+    }
+]
+
 # TODO(martin): merge this with other maps and lookup functions, eventually.
 LICENSE_SLUG_MAP = {
     "//archaeologydataservice.ac.uk/advice/termsofuseandaccess.xhtml/": "ADS-UK",
@@ -337,6 +346,16 @@ class DataciteImporter(EntityImporter):
             print('[{}] skipping record w/o title: {}'.format(doi, obj), file=sys.stderr)
             return False
 
+        # check for blacklisted "spam", e.g. "FULL MOVIE"
+        for rule in DATACITE_TITLE_SPAM_WORDGROUPS:
+            seen = set()
+            for token in rule.get("tokens", []):
+                if token in title.lower():
+                    seen.add(token)
+            if len(seen) >= rule.get("min"):
+                print("[{}] skipping spammy title: {}".format(doi, obj), file=sys.stderr)
+                return False
+
         if not subtitle:
             subtitle = None
         else:
diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py
index ba09ba74..6bc0e7b8 100644
--- a/python/tests/import_datacite.py
+++ b/python/tests/import_datacite.py
@@ -318,6 +318,12 @@ def test_parse_datacite_dates():
         result = parse_datacite_dates(case.input)
         assert result == case.result, case.about
 
+def test_datacite_spammy_title(datacite_importer):
+    r = datacite_importer.parse_record({"title": """HD! My Hero academia
+                                        Heroes: Rising [2020]Full Movie Watch
+                                        Online And Free Download""",
+                                        "attributes": {"doi": "10.1234/1234"}})
+    assert r == False
 
 def test_datacite_importer(datacite_importer):
     last_index = datacite_importer.api.get_changelog(limit=1)[0].index
author	Martin Czygan <martin.czygan@gmail.com>	2020-09-23 00:45:38 +0200
committer	Martin Czygan <martin.czygan@gmail.com>	2020-09-23 00:45:38 +0200
commit	6589bffcde0ece37f4976553407ee849af7f28b5 (patch)
tree	ad89a3c8f56620d341cc35aecbca44f051443d0d /python
parent	5c879f7f147b11d1013772b2f9a74442984f58fc (diff)
download	fatcat-6589bffcde0ece37f4976553407ee849af7f28b5.tar.gz fatcat-6589bffcde0ece37f4976553407ee849af7f28b5.zip