diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-01-10 16:10:05 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-01-10 16:10:07 -0800 |
commit | 867d71bbfd89be44fedf5955a60cb1e8b774b390 (patch) | |
tree | dd16bd1f831fba0eb2d54e3d565195e9471bf15c | |
parent | 2536282cc635ce605c0bfd63cf7a9c0e10ef883c (diff) | |
download | sandcrawler-867d71bbfd89be44fedf5955a60cb1e8b774b390.tar.gz sandcrawler-867d71bbfd89be44fedf5955a60cb1e8b774b390.zip |
handle SPNv2-then-CDX lookup failures
- use a 10 second retry delay (`retry_sleep=10.0`) if the CDX result isn't immediately available.
blech.
- if the CDX lookup still fails (KeyError), report status `spn2-cdx-lookup-failure` (rather than a generic wayback-error) and move on
-rw-r--r-- | python/sandcrawler/ia.py | 29 |
1 files changed, 23 insertions, 6 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index 73f8484..0ac1320 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -718,12 +718,29 @@ class SavePageNowClient: print("Failed pdf.sciencedirectassets.com hack!", file=sys.stderr) #print(elsevier_pdf_cdx, file=sys.stderr) - # fetch exact CDX row - cdx_row = wayback_client.cdx_client.fetch( - url=spn_result.terminal_url, - datetime=spn_result.terminal_dt, - filter_status_code=200, - ) + if not cdx_row: + # lookup exact + try: + cdx_row = wayback_client.cdx_client.fetch( + url=spn_result.terminal_url, + datetime=spn_result.terminal_dt, + filter_status_code=200, + retry_sleep=10.0, + ) + except KeyError as ke: + print(str(ke), file=sys.stderr) + return ResourceResult( + start_url=start_url, + hit=False, + status="spn2-cdx-lookup-failure", + terminal_url=spn_result.terminal_url, + terminal_dt=spn_result.terminal_dt, + terminal_status_code=None, + body=None, + cdx=None, + ) + + #print(cdx_row, file=sys.stderr) if '/' in cdx_row.warc_path: # Usually can't do this kind of direct fetch because CDX result is recent/live |