diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-02-24 16:50:50 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-02-24 16:50:50 -0800 |
commit | 6a9c1fe64941b4975fa5e52fc3df58b38bf9a0f0 (patch) | |
tree | bf47238cbf964a0894ba6989d9b4aafd3c64d98c | |
parent | 266b0a2d5928921d3b3f992fa249b22f7a5edb16 (diff) | |
download | sandcrawler-6a9c1fe64941b4975fa5e52fc3df58b38bf9a0f0.tar.gz sandcrawler-6a9c1fe64941b4975fa5e52fc3df58b38bf9a0f0.zip |
ingest: treat CDX lookup error as a wayback-error
-rw-r--r-- | python/sandcrawler/ia.py | 5 |
1 file changed, 4 insertions, 1 deletion
```diff
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index c3ca80f..7230ee0 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -381,7 +381,10 @@ class WaybackClient:
             revisit_uri = revisit_uri.decode('utf-8')
             revisit_dt = revisit_dt.decode('utf-8').replace('-', '').replace(':', '').replace('T', '').replace('Z', '')
             assert len(revisit_dt) == 14
-            revisit_cdx = self.cdx_client.fetch(revisit_uri, revisit_dt)
+            try:
+                revisit_cdx = self.cdx_client.fetch(revisit_uri, revisit_dt)
+            except KeyError as ke:
+                raise WaybackError("Revist resolution failed: {}".format(ke))
             body = self.fetch_petabox_body(
                 csize=revisit_cdx.warc_csize,
                 offset=revisit_cdx.warc_offset,
```