aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-01-10 16:10:05 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-01-10 16:10:07 -0800
commit: 867d71bbfd89be44fedf5955a60cb1e8b774b390 (patch)
tree: dd16bd1f831fba0eb2d54e3d565195e9471bf15c
parent: 2536282cc635ce605c0bfd63cf7a9c0e10ef883c (diff)
download: sandcrawler-867d71bbfd89be44fedf5955a60cb1e8b774b390.tar.gz
download: sandcrawler-867d71bbfd89be44fedf5955a60cb1e8b774b390.zip
handle SPNv2-then-CDX lookup failures
- use a 10 second delay if CDX result isn't immediately available. blech.
- if there is a lookup failure, call it a wayback-error and move on
-rw-r--r-- python/sandcrawler/ia.py 29
1 file changed, 23 insertions, 6 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 73f8484..0ac1320 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -718,12 +718,29 @@ class SavePageNowClient:
print("Failed pdf.sciencedirectassets.com hack!", file=sys.stderr)
#print(elsevier_pdf_cdx, file=sys.stderr)
- # fetch exact CDX row
- cdx_row = wayback_client.cdx_client.fetch(
- url=spn_result.terminal_url,
- datetime=spn_result.terminal_dt,
- filter_status_code=200,
- )
+ if not cdx_row:
+ # lookup exact
+ try:
+ cdx_row = wayback_client.cdx_client.fetch(
+ url=spn_result.terminal_url,
+ datetime=spn_result.terminal_dt,
+ filter_status_code=200,
+ retry_sleep=10.0,
+ )
+ except KeyError as ke:
+ print(str(ke), file=sys.stderr)
+ return ResourceResult(
+ start_url=start_url,
+ hit=False,
+ status="spn2-cdx-lookup-failure",
+ terminal_url=spn_result.terminal_url,
+ terminal_dt=spn_result.terminal_dt,
+ terminal_status_code=None,
+ body=None,
+ cdx=None,
+ )
+
+ #print(cdx_row, file=sys.stderr)
if '/' in cdx_row.warc_path:
# Usually can't do this kind of direct fetch because CDX result is recent/live