diff options
Diffstat (limited to 'python')
| -rw-r--r-- | python/sandcrawler/ia.py | 29 | 
1 file changed, 23 insertions, 6 deletions
| diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index 73f8484..0ac1320 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -718,12 +718,29 @@ class SavePageNowClient:                  print("Failed pdf.sciencedirectassets.com hack!", file=sys.stderr)                  #print(elsevier_pdf_cdx, file=sys.stderr) -        # fetch exact CDX row -        cdx_row = wayback_client.cdx_client.fetch( -            url=spn_result.terminal_url, -            datetime=spn_result.terminal_dt, -            filter_status_code=200, -        ) +        if not cdx_row: +            # lookup exact +            try: +                cdx_row = wayback_client.cdx_client.fetch( +                    url=spn_result.terminal_url, +                    datetime=spn_result.terminal_dt, +                    filter_status_code=200, +                    retry_sleep=10.0, +                ) +            except KeyError as ke: +                print(str(ke), file=sys.stderr) +                return ResourceResult( +                    start_url=start_url, +                    hit=False, +                    status="spn2-cdx-lookup-failure", +                    terminal_url=spn_result.terminal_url, +                    terminal_dt=spn_result.terminal_dt, +                    terminal_status_code=None, +                    body=None, +                    cdx=None, +                ) + +        #print(cdx_row, file=sys.stderr)          if '/' in cdx_row.warc_path:              # Usually can't do this kind of direct fetch because CDX result is recent/live | 
