aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-08-11 17:16:39 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-08-11 17:16:39 -0700
commit d5f0602e80847adf3d359a7fd06cc131c07cb6dd (patch)
tree fa89dadd5f19d7a1c26069254748f5142f5fce06 /python
parent 5c7f9bc60b372006adac8e47ee2f4f1f73b84897 (diff)
download sandcrawler-d5f0602e80847adf3d359a7fd06cc131c07cb6dd.tar.gz
sandcrawler-d5f0602e80847adf3d359a7fd06cc131c07cb6dd.zip
ingest: check for URL blocklist and cookie URL patterns on every hop
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/ingest.py 13
1 files changed, 13 insertions, 0 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 263b9d5..1f693dc 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -323,6 +323,19 @@ class IngestFileWorker(SandcrawlerWorker):
while len(hops) <= self.max_hops:
result['hops'] = hops
+
+ # check against blocklist again on each hop
+ for block in self.base_url_blocklist:
+ if block in next_url:
+ result['status'] = "skip-url-blocklist"
+ return result
+
+ # check for popular cookie blocking URL patterns. On successful SPN
+ # crawls, shouldn't see these redirect URLs
+ if '/cookieAbsent' in next_url or 'cookieSet=1' in next_url:
+ result['status'] = 'blocked-cookie'
+ return result
+
try:
resource = self.find_resource(next_url, best_mimetype, force_recrawl=force_recrawl)
except SavePageNowError as e: