diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2020-09-23 00:45:38 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2020-09-23 00:45:38 +0200 |
commit | 6589bffcde0ece37f4976553407ee849af7f28b5 (patch) | |
tree | ad89a3c8f56620d341cc35aecbca44f051443d0d | |
parent | 5c879f7f147b11d1013772b2f9a74442984f58fc (diff) | |
download | fatcat-6589bffcde0ece37f4976553407ee849af7f28b5.tar.gz fatcat-6589bffcde0ece37f4976553407ee849af7f28b5.zip |
address spammy datacite titles
These titles seemingly originate from Zenodo, for example:
* https://fatcat.wiki/release/rzcpjwukobd4pj36ipla22cnoi
* https://doi.org/10.5281/zenodo.4041777
Currently, about 3400 records have "FULL MOVIE" in their title.
-rw-r--r-- | python/fatcat_tools/importers/datacite.py | 19 | ||||
-rw-r--r-- | python/tests/import_datacite.py | 6 |
2 files changed, 25 insertions, 0 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index d4ff6784..86740e80 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -154,6 +154,15 @@ UNKNOWN_MARKERS = set(DATACITE_UNKNOWN_MARKERS).union(set(( # UNKNOWN_MARKERS_LOWER are lowercase version of UNKNOWN blacklist. UNKNOWN_MARKERS_LOWER = set((v.lower() for v in UNKNOWN_MARKERS)) +# Any "min" number of "tokens" will signal "spam", https://fatcat.wiki/release/rzcpjwukobd4pj36ipla22cnoi +DATACITE_TITLE_SPAM_WORDGROUPS = [ + { + "tokens": ('full', 'movies', 'movie', 'watch', 'streaming', 'online', + 'free', 'hd', 'download', 'english', 'subtitle', 'bluray'), + "min": 4, + } +] + # TODO(martin): merge this with other maps and lookup functions, eventually. LICENSE_SLUG_MAP = { "//archaeologydataservice.ac.uk/advice/termsofuseandaccess.xhtml/": "ADS-UK", @@ -337,6 +346,16 @@ class DataciteImporter(EntityImporter): print('[{}] skipping record w/o title: {}'.format(doi, obj), file=sys.stderr) return False + # check for blacklisted "spam", e.g. "FULL MOVIE" + for rule in DATACITE_TITLE_SPAM_WORDGROUPS: + seen = set() + for token in rule.get("tokens", []): + if token in title.lower(): + seen.add(token) + if len(seen) >= rule.get("min"): + print("[{}] skipping spammy title: {}".format(doi, obj), file=sys.stderr) + return False + if not subtitle: subtitle = None else: diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index ba09ba74..6bc0e7b8 100644 --- a/python/tests/import_datacite.py +++ b/python/tests/import_datacite.py @@ -318,6 +318,12 @@ def test_parse_datacite_dates(): result = parse_datacite_dates(case.input) assert result == case.result, case.about +def test_datacite_spammy_title(datacite_importer): + r = datacite_importer.parse_record({"title": """HD! 
My Hero academia + Heroes: Rising [2020]Full Movie Watch + Online And Free Download""", + "attributes": {"doi": "10.1234/1234"}}) + assert r == False def test_datacite_importer(datacite_importer): last_index = datacite_importer.api.get_changelog(limit=1)[0].index |