diff options
-rw-r--r-- | python/sandcrawler/ia.py | 31 |
1 files changed, 31 insertions, 0 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index 2bc52ce..60e3d9a 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -910,6 +910,9 @@ class SavePageNowClient: raise SavePageNowBackoffError("SPNv2 user-session-limit") elif status.startswith("error:"): status = "spn2-" + status + # despite other errors, call these a failure (so we don't retry) + if spn_result.terminal_url and (spn_result.terminal_url.endswith('/cookieAbsent') or spn_result.terminal_url.endswith("cookieSet=1")): + status = "blocked-cookie" return ResourceResult( start_url=start_url, hit=False, @@ -923,6 +926,34 @@ class SavePageNowClient: ) #print(spn_result, file=sys.stderr) + # detect partial URL response (aka, missing full URL) + if spn_result.terminal_url.startswith('/'): + return ResourceResult( + start_url=start_url, + hit=False, + status="spn2-success-partial-url", + terminal_url=spn_result.terminal_url, + terminal_dt=spn_result.terminal_dt, + terminal_status_code=None, + body=None, + cdx=None, + revisit_cdx=None, + ) + + # don't try to CDX fetch for this common cookie block terminal + if spn_result.terminal_url.endswith('/cookieAbsent') or spn_result.terminal_url.endswith("cookieSet=1"): + return ResourceResult( + start_url=start_url, + hit=False, + status="blocked-cookie", + terminal_url=spn_result.terminal_url, + terminal_dt=spn_result.terminal_dt, + terminal_status_code=None, + body=None, + cdx=None, + revisit_cdx=None, + ) + cdx_row = None # hack to work around elsevier weirdness if "://pdf.sciencedirectassets.com/" in spn_result.request_url: |