diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-10-17 17:09:57 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-10-17 17:09:57 -0700 |
commit | 4f5c66fd33263a146609efbfcbfde4a344f0ac2d (patch) | |
tree | b05481f4e31182e2609b2f1da77a561c3b87b0f1 /python | |
parent | 36b7a593f0c65d6dc9b15a7d0e87b345decdbff4 (diff) | |
download | sandcrawler-4f5c66fd33263a146609efbfcbfde4a344f0ac2d.tar.gz sandcrawler-4f5c66fd33263a146609efbfcbfde4a344f0ac2d.zip |
ingest: handle cookieAbsent and partial SPNv2 URL response cases better
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/ia.py | 31 |
1 files changed, 31 insertions, 0 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index 2bc52ce..60e3d9a 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -910,6 +910,9 @@ class SavePageNowClient: raise SavePageNowBackoffError("SPNv2 user-session-limit") elif status.startswith("error:"): status = "spn2-" + status + # despite other errors, call these a failure (so we don't retry) + if spn_result.terminal_url and (spn_result.terminal_url.endswith('/cookieAbsent') or spn_result.terminal_url.endswith("cookieSet=1")): + status = "blocked-cookie" return ResourceResult( start_url=start_url, hit=False, @@ -923,6 +926,34 @@ class SavePageNowClient: ) #print(spn_result, file=sys.stderr) + # detect partial URL response (aka, missing full URL) + if spn_result.terminal_url.startswith('/'): + return ResourceResult( + start_url=start_url, + hit=False, + status="spn2-success-partial-url", + terminal_url=spn_result.terminal_url, + terminal_dt=spn_result.terminal_dt, + terminal_status_code=None, + body=None, + cdx=None, + revisit_cdx=None, + ) + + # don't try to CDX fetch for this common cookie block terminal + if spn_result.terminal_url.endswith('/cookieAbsent') or spn_result.terminal_url.endswith("cookieSet=1"): + return ResourceResult( + start_url=start_url, + hit=False, + status="blocked-cookie", + terminal_url=spn_result.terminal_url, + terminal_dt=spn_result.terminal_dt, + terminal_status_code=None, + body=None, + cdx=None, + revisit_cdx=None, + ) + cdx_row = None # hack to work around elsevier weirdness if "://pdf.sciencedirectassets.com/" in spn_result.request_url: |