From 660fe9eeb3ec2bd0f4ae8c9c62932098c1a7b625 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Fri, 3 Sep 2021 12:21:41 -0700
Subject: refactor and expand wall/block/cookie URL patterns

---
 python/sandcrawler/ingest.py | 31 +++++++++++++++++++++++++------
 python/tests/test_ingest.py  | 14 ++++++++++++++
 2 files changed, 39 insertions(+), 6 deletions(-)

diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 3fa34e3..fa60e27 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -95,9 +95,13 @@ class IngestFileWorker(SandcrawlerWorker):
             "://archive.org/",
             "://www.archive.org/",
             "://web.archive.org/web/",
+
+            # out of scope
             "://openlibrary.org/",
             "://www.openlibrary.org/",
             "://fatcat.wiki/",
+            "://orcid.org/",
+            "://doaj.org/",
 
             # Domain squats
             "://bartandjones.com",
@@ -123,6 +127,9 @@ class IngestFileWorker(SandcrawlerWorker):
 
             # DOI prefixes
             "://doi.org/10.2307/", # JSTOR; slow and many redirects
+
+            # deprecated domain (doesn't redirect correctly)
+            "://edoc.mpg.de/",
         ]
 
         self.wall_blocklist = [
@@ -131,6 +138,14 @@ class IngestFileWorker(SandcrawlerWorker):
             "://login.bepress.com/",
             "?SAMLRequest=",
             "://osapublishing.org/captcha/",
+            "/password-login",
+            "://gateway.isiknowledge.com/",
+        ]
+
+        self.cookie_blocklist = [
+            "/cookieAbsent",
+            "cookieSet=1",
+            "error=cookies_not_supported",
         ]
 
         # these are special-case web domains for which we want SPN2 to not run
@@ -518,14 +533,16 @@ class IngestFileWorker(SandcrawlerWorker):
             # check against known loginwall URLs
             for block in self.wall_blocklist:
                 if block in next_url:
+                    # TODO: blocked-wall instead of skip-wall
                     result['status'] = "skip-wall"
                     return result
 
             # check for popular cookie blocking URL patterns. On successful SPN
             # crawls, shouldn't see these redirect URLs
-            if '/cookieAbsent' in next_url or 'cookieSet=1' in next_url or 'error=cookies_not_supported' in next_url:
-                result['status'] = 'blocked-cookie'
-                return result
+            for pattern in self.cookie_blocklist:
+                if pattern in next_url:
+                    result['status'] = 'blocked-cookie'
+                    return result
 
             try:
                 resource = self.find_resource(next_url, best_mimetype, force_recrawl=force_recrawl)
@@ -571,9 +588,11 @@ class IngestFileWorker(SandcrawlerWorker):
                 result['status'] = resource.status
                 return result
 
-            if resource.terminal_url and ('/cookieAbsent' in resource.terminal_url or 'cookieSet=1' in resource.terminal_url):
-                result['status'] = 'blocked-cookie'
-                return result
+            if resource.terminal_url:
+                for pattern in self.cookie_blocklist:
+                    if pattern in resource.terminal_url:
+                        result['status'] = 'blocked-cookie'
+                        return result
 
             if not resource.body:
                 result['status'] = 'null-body'
diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py
index 46346b7..b51f721 100644
--- a/python/tests/test_ingest.py
+++ b/python/tests/test_ingest.py
@@ -191,3 +191,17 @@ def test_ingest_wall_blocklist(ingest_worker):
     assert resp['status'] == "skip-wall"
     assert resp['request'] == request
 
+@responses.activate
+def test_ingest_cookie_blocklist(ingest_worker):
+
+    request = {
+        'ingest_type': 'pdf',
+        'base_url': "https://test.fatcat.wiki/cookieAbsent",
+    }
+
+    resp = ingest_worker.process(request)
+
+    assert resp['hit'] == False
+    assert resp['status'] == "blocked-cookie"
+    assert resp['request'] == request
+
-- 
cgit v1.2.3