summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--python/fatcat_tools/importers/datacite.py19
-rw-r--r--python/fatcat_web/templates/container_view.html10
-rw-r--r--python/fatcat_web/templates/release_view.html6
-rw-r--r--python/tests/import_datacite.py6
4 files changed, 33 insertions, 8 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index d4ff6784..86740e80 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -154,6 +154,15 @@ UNKNOWN_MARKERS = set(DATACITE_UNKNOWN_MARKERS).union(set((
# UNKNOWN_MARKERS_LOWER are lowercase version of UNKNOWN blacklist.
UNKNOWN_MARKERS_LOWER = set((v.lower() for v in UNKNOWN_MARKERS))
+# A title containing at least "min" of a group's "tokens" is treated as spam; example: https://fatcat.wiki/release/rzcpjwukobd4pj36ipla22cnoi
+DATACITE_TITLE_SPAM_WORDGROUPS = [
+ {
+ "tokens": ('full', 'movies', 'movie', 'watch', 'streaming', 'online',
+ 'free', 'hd', 'download', 'english', 'subtitle', 'bluray'),
+ "min": 4,
+ }
+]
+
# TODO(martin): merge this with other maps and lookup functions, eventually.
LICENSE_SLUG_MAP = {
"//archaeologydataservice.ac.uk/advice/termsofuseandaccess.xhtml/": "ADS-UK",
@@ -337,6 +346,16 @@ class DataciteImporter(EntityImporter):
print('[{}] skipping record w/o title: {}'.format(doi, obj), file=sys.stderr)
return False
+ # skip titles containing at least "min" distinct spam tokens (case-insensitive substring match), e.g. "FULL MOVIE"
+ for rule in DATACITE_TITLE_SPAM_WORDGROUPS:
+ seen = set()
+ for token in rule.get("tokens", []):
+ if token in title.lower():
+ seen.add(token)
+ if len(seen) >= rule.get("min"):
+ print("[{}] skipping spammy title: {}".format(doi, obj), file=sys.stderr)
+ return False
+
if not subtitle:
subtitle = None
else:
diff --git a/python/fatcat_web/templates/container_view.html b/python/fatcat_web/templates/container_view.html
index 91430715..32c06a53 100644
--- a/python/fatcat_web/templates/container_view.html
+++ b/python/fatcat_web/templates/container_view.html
@@ -121,10 +121,10 @@
<i class="icon times grey"></i> Not in <a href="https://road.issn.org">ISSN ROAD</a><br>
{% endif %}
- {% if container._es.in_kbart == True %}
- <i class="icon check green"></i> In <a href="https://thekeepers.org/purl/issn/{{ container.issnl }}">Keepers Registery</a><br>
- {% elif container._es.in_kbart == False %}
- <i class="icon times grey"></i> Not in <a href="https://thekeepers.org/journals?query={{ container.issnl }}">Keepers Registry</a><br>
+ {% if container._es.any_kbart == True %}
+ <i class="icon check green"></i> In <a href="https://keepers.issn.org/?q=api/search&search[]=MUST=allissn={{ container.issnl }}&search[]=MUST_EXIST=keepers">Keepers Registry</a><br>
+ {% elif container._es.any_kbart == False %}
+ <i class="icon times grey"></i> Not in <a href="https://keepers.issn.org/?q=api/search&search[]=MUST=allissn={{ container.issnl }}&search[]=MUST_EXIST=keepers">Keepers Registry</a><br>
{% endif %}
{% if container.extra and container.extra.sherpa_romeo and container.extra.sherpa_romeo.color %}
@@ -138,7 +138,7 @@
<div class="content">
{% if container.issnl %}
<a href="https://portal.issn.org/resource/issn/{{ container.issnl }}">ISSN Portal</a><br>
- <a href="https://thekeepers.org/purl/issn/{{ container.issnl }}">The Keepers Registry</a> (preservation)<br>
+ <a href="https://keepers.issn.org/?q=api/search&search[]=MUST=allissn={{ container.issnl }}&search[]=MUST_EXIST=keepers">The Keepers Registry</a> (preservation)<br>
<a href="http://www.sherpa.ac.uk/romeo/issn/{{ container.issnl }}/">SHERPA/RoMEO</a> (access policies)<br>
{% endif %}
{% if not container.wikidata_qid %}
diff --git a/python/fatcat_web/templates/release_view.html b/python/fatcat_web/templates/release_view.html
index f907dab8..54ead411 100644
--- a/python/fatcat_web/templates/release_view.html
+++ b/python/fatcat_web/templates/release_view.html
@@ -370,9 +370,9 @@ accessible version.
{% elif release.container._es.in_road == False %}
<i class="icon times grey"></i> Not in <a href="https://road.issn.org">ISSN ROAD</a><br>
{% endif %}
- {% if release.container._es.in_kbart == True %}
- <i class="icon check green"></i> In <a href="https://thekeepers.org/purl/issn/{{ release.container.issnl }}">Keepers Registery</a><br>
- {% elif release.container._es.in_kbart == False %} <i class="icon times grey"></i> Not in <a href="https://thekeepers.org/journals?query={{ release.container.issnl }}">Keepers Registry</a><br>
+ {% if release.container._es.any_kbart == True %}
+ <i class="icon check green"></i> In <a href="https://keepers.issn.org/?q=api/search&search[]=MUST=allissn={{ release.container.issnl }}&search[]=MUST_EXIST=keepers">Keepers Registry</a><br>
+ {% elif release.container._es.any_kbart == False %} <i class="icon times grey"></i> Not in <a href="https://keepers.issn.org/?q=api/search&search[]=MUST=allissn={{ release.container.issnl }}&search[]=MUST_EXIST=keepers">Keepers Registry</a><br>
{% endif %}
{% endif %}
{% if release.container.issnl != None %}
diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py
index ba09ba74..6bc0e7b8 100644
--- a/python/tests/import_datacite.py
+++ b/python/tests/import_datacite.py
@@ -318,6 +318,12 @@ def test_parse_datacite_dates():
result = parse_datacite_dates(case.input)
assert result == case.result, case.about
+def test_datacite_spammy_title(datacite_importer):
+ r = datacite_importer.parse_record({"title": """HD! My Hero academia
+ Heroes: Rising [2020]Full Movie Watch
+ Online And Free Download""",
+ "attributes": {"doi": "10.1234/1234"}})
+ assert r == False
def test_datacite_importer(datacite_importer):
last_index = datacite_importer.api.get_changelog(limit=1)[0].index