From 867d71bbfd89be44fedf5955a60cb1e8b774b390 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 10 Jan 2020 16:10:05 -0800 Subject: handle SPNv2-then-CDX lookup failures - use a 10 second retry delay if the CDX result isn't immediately available. blech. - if the lookup still fails, return a "spn2-cdx-lookup-failure" result (hit=False) and move on --- python/sandcrawler/ia.py | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index 73f8484..0ac1320 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -718,12 +718,29 @@ class SavePageNowClient: print("Failed pdf.sciencedirectassets.com hack!", file=sys.stderr) #print(elsevier_pdf_cdx, file=sys.stderr) - # fetch exact CDX row - cdx_row = wayback_client.cdx_client.fetch( - url=spn_result.terminal_url, - datetime=spn_result.terminal_dt, - filter_status_code=200, - ) + if not cdx_row: + # lookup exact + try: + cdx_row = wayback_client.cdx_client.fetch( + url=spn_result.terminal_url, + datetime=spn_result.terminal_dt, + filter_status_code=200, + retry_sleep=10.0, + ) + except KeyError as ke: + print(str(ke), file=sys.stderr) + return ResourceResult( + start_url=start_url, + hit=False, + status="spn2-cdx-lookup-failure", + terminal_url=spn_result.terminal_url, + terminal_dt=spn_result.terminal_dt, + terminal_status_code=None, + body=None, + cdx=None, + ) + + #print(cdx_row, file=sys.stderr) if '/' in cdx_row.warc_path: # Usually can't do this kind of direct fetch because CDX result is recent/live -- cgit v1.2.3