aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-10-17 17:09:57 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-10-17 17:09:57 -0700
commit 4f5c66fd33263a146609efbfcbfde4a344f0ac2d (patch)
tree b05481f4e31182e2609b2f1da77a561c3b87b0f1
parent 36b7a593f0c65d6dc9b15a7d0e87b345decdbff4 (diff)
download sandcrawler-4f5c66fd33263a146609efbfcbfde4a344f0ac2d.tar.gz
sandcrawler-4f5c66fd33263a146609efbfcbfde4a344f0ac2d.zip
ingest: handle cookieAbsent and partial SPNv2 URL response cases better
-rw-r--r-- python/sandcrawler/ia.py 31
1 files changed, 31 insertions, 0 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 2bc52ce..60e3d9a 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -910,6 +910,9 @@ class SavePageNowClient:
raise SavePageNowBackoffError("SPNv2 user-session-limit")
elif status.startswith("error:"):
status = "spn2-" + status
+ # despite other errors, call these a failure (so we don't retry)
+ if spn_result.terminal_url and (spn_result.terminal_url.endswith('/cookieAbsent') or spn_result.terminal_url.endswith("cookieSet=1")):
+ status = "blocked-cookie"
return ResourceResult(
start_url=start_url,
hit=False,
@@ -923,6 +926,34 @@ class SavePageNowClient:
)
#print(spn_result, file=sys.stderr)
+ # detect partial URL response (aka, missing full URL)
+ if spn_result.terminal_url.startswith('/'):
+ return ResourceResult(
+ start_url=start_url,
+ hit=False,
+ status="spn2-success-partial-url",
+ terminal_url=spn_result.terminal_url,
+ terminal_dt=spn_result.terminal_dt,
+ terminal_status_code=None,
+ body=None,
+ cdx=None,
+ revisit_cdx=None,
+ )
+
+ # don't try to CDX fetch for this common cookie block terminal
+ if spn_result.terminal_url.endswith('/cookieAbsent') or spn_result.terminal_url.endswith("cookieSet=1"):
+ return ResourceResult(
+ start_url=start_url,
+ hit=False,
+ status="blocked-cookie",
+ terminal_url=spn_result.terminal_url,
+ terminal_dt=spn_result.terminal_dt,
+ terminal_status_code=None,
+ body=None,
+ cdx=None,
+ revisit_cdx=None,
+ )
+
cdx_row = None
# hack to work around elsevier weirdness
if "://pdf.sciencedirectassets.com/" in spn_result.request_url: