diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-10-21 12:22:30 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-10-21 12:22:30 -0700 |
commit | 86cc15d9c2e1f2e857d0dcf141dd5ea4d720dff5 (patch) | |
tree | f2eccc61f14b9159f7656e873b288ef2bbf38db7 | |
parent | 200bf734bd459dd3c7a147b3dfe127dbf0ed7f70 (diff) | |
download | sandcrawler-86cc15d9c2e1f2e857d0dcf141dd5ea4d720dff5.tar.gz sandcrawler-86cc15d9c2e1f2e857d0dcf141dd5ea4d720dff5.zip |
ingest: add a check for blocked-cookie before trying PDF url extraction
-rw-r--r-- | python/sandcrawler/ingest.py | 11 |
1 file changed, 11 insertions, 0 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 57988e8..ef4ca25 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -414,6 +414,17 @@ class IngestFileWorker(SandcrawlerWorker):
                 return result
 
             file_meta = gen_file_metadata(resource.body)
+            if resource.terminal_url and ('/cookieAbsent' in next_url or 'cookieSet=1' in resource.terminal_url):
+                result['status'] = 'blocked-cookie'
+                result['terminal'] = {
+                    "terminal_url": resource.terminal_url,
+                    "terminal_dt": resource.terminal_dt,
+                    "terminal_status_code": resource.terminal_status_code,
+                }
+                if resource.terminal_url not in result['hops']:
+                    result['hops'].append(resource.terminal_url)
+                return result
+
             # crude handling of content-encoding; wayback fetch library usually
             # (and should always?) handle this
             if file_meta['mimetype'] == 'application/gzip' and resource.cdx and resource.cdx.mimetype != 'application/gzip':