From 4f5c66fd33263a146609efbfcbfde4a344f0ac2d Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Sat, 17 Oct 2020 17:09:57 -0700 Subject: ingest: handle cookieAbsent and partial SPNv2 URL response cases better --- python/sandcrawler/ia.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'python/sandcrawler/ia.py') diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index 2bc52ce..60e3d9a 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -910,6 +910,9 @@ class SavePageNowClient: raise SavePageNowBackoffError("SPNv2 user-session-limit") elif status.startswith("error:"): status = "spn2-" + status + # despite other errors, call these a failure (so we don't retry) + if spn_result.terminal_url and (spn_result.terminal_url.endswith('/cookieAbsent') or spn_result.terminal_url.endswith("cookieSet=1")): + status = "blocked-cookie" return ResourceResult( start_url=start_url, hit=False, @@ -923,6 +926,34 @@ class SavePageNowClient: ) #print(spn_result, file=sys.stderr) + # detect partial URL response (aka, missing full URL) + if spn_result.terminal_url.startswith('/'): + return ResourceResult( + start_url=start_url, + hit=False, + status="spn2-success-partial-url", + terminal_url=spn_result.terminal_url, + terminal_dt=spn_result.terminal_dt, + terminal_status_code=None, + body=None, + cdx=None, + revisit_cdx=None, + ) + + # don't try to CDX fetch for this common cookie block terminal + if spn_result.terminal_url.endswith('/cookieAbsent') or spn_result.terminal_url.endswith("cookieSet=1"): + return ResourceResult( + start_url=start_url, + hit=False, + status="blocked-cookie", + terminal_url=spn_result.terminal_url, + terminal_dt=spn_result.terminal_dt, + terminal_status_code=None, + body=None, + cdx=None, + revisit_cdx=None, + ) + cdx_row = None # hack to work around elsevier weirdness if "://pdf.sciencedirectassets.com/" in spn_result.request_url: -- cgit v1.2.3