about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-07-14 16:38:21 -0700
committer Bryan Newbold <bnewbold@archive.org> 2021-07-14 16:38:21 -0700
commit 0cd655b946ebfbc3ed0bf1a2a1c1d9b94adca135 (patch)
tree e14a6f22f8e027b98fb807736608890da312899f
parent 23374f2d10914e06c67e7c0c1f9c37ba98e36eeb (diff)
download sandcrawler-0cd655b946ebfbc3ed0bf1a2a1c1d9b94adca135.tar.gz
sandcrawler-0cd655b946ebfbc3ed0bf1a2a1c1d9b94adca135.zip
more blocked-cookie patterns; fix old typo
-rw-r--r-- python/sandcrawler/ingest.py 4
1 files changed, 2 insertions, 2 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index b610ab4..290bebc 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -523,7 +523,7 @@ class IngestFileWorker(SandcrawlerWorker):
# check for popular cookie blocking URL patterns. On successful SPN
# crawls, shouldn't see these redirect URLs
- if '/cookieAbsent' in next_url or 'cookieSet=1' in next_url:
+ if '/cookieAbsent' in next_url or 'cookieSet=1' in next_url or 'error=cookies_not_supported' in next_url:
result['status'] = 'blocked-cookie'
return result
@@ -571,7 +571,7 @@ class IngestFileWorker(SandcrawlerWorker):
result['status'] = resource.status
return result
- if resource.terminal_url and ('/cookieAbsent' in next_url or 'cookieSet=1' in resource.terminal_url):
+ if resource.terminal_url and ('/cookieAbsent' in resource.terminal_url or 'cookieSet=1' in resource.terminal_url):
result['status'] = 'blocked-cookie'
return result