ingest: add a check for blocked-cookie before trying PDF url extraction

author: Bryan Newbold <bnewbold@archive.org> 2020-10-21 12:22:30 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-10-21 12:22:30 -0700
commit: 86cc15d9c2e1f2e857d0dcf141dd5ea4d720dff5 (patch)
tree: f2eccc61f14b9159f7656e873b288ef2bbf38db7
parent: 200bf734bd459dd3c7a147b3dfe127dbf0ed7f70 (diff)
download: sandcrawler-86cc15d9c2e1f2e857d0dcf141dd5ea4d720dff5.tar.gz sandcrawler-86cc15d9c2e1f2e857d0dcf141dd5ea4d720dff5.zip
1 file changed, 11 insertions, 0 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 57988e8..ef4ca25 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -414,6 +414,17 @@ class IngestFileWorker(SandcrawlerWorker):
                 return result
             file_meta = gen_file_metadata(resource.body)
 
+            if resource.terminal_url and ('/cookieAbsent' in next_url or 'cookieSet=1' in resource.terminal_url):
+                result['status'] = 'blocked-cookie'
+                result['terminal'] = {
+                    "terminal_url": resource.terminal_url,
+                    "terminal_dt": resource.terminal_dt,
+                    "terminal_status_code": resource.terminal_status_code,
+                }
+                if resource.terminal_url not in result['hops']:
+                    result['hops'].append(resource.terminal_url)
+                return result
+
             # crude handling of content-encoding; wayback fetch library usually
             # (and should always?) handle this
             if file_meta['mimetype'] == 'application/gzip' and resource.cdx and resource.cdx.mimetype != 'application/gzip':
author	Bryan Newbold <bnewbold@archive.org>	2020-10-21 12:22:30 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2020-10-21 12:22:30 -0700
commit	86cc15d9c2e1f2e857d0dcf141dd5ea4d720dff5 (patch)
tree	f2eccc61f14b9159f7656e873b288ef2bbf38db7
parent	200bf734bd459dd3c7a147b3dfe127dbf0ed7f70 (diff)
download	sandcrawler-86cc15d9c2e1f2e857d0dcf141dd5ea4d720dff5.tar.gz sandcrawler-86cc15d9c2e1f2e857d0dcf141dd5ea4d720dff5.zip