diff options
Diffstat (limited to 'python/sandcrawler/workers.py')
-rw-r--r-- | python/sandcrawler/workers.py | 8 |
1 files changed, 4 insertions, 4 deletions
diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py index ba0358f..ceb6671 100644 --- a/python/sandcrawler/workers.py +++ b/python/sandcrawler/workers.py @@ -132,7 +132,7 @@ class SandcrawlerFetchWorker(SandcrawlerWorker): raise Exception("wayback client not configured for this SandcrawlerFetchWorker") try: start = time.time() - blob = self.wayback_client.fetch_petabox_body( + blob: bytes = self.wayback_client.fetch_petabox_body( csize=record['warc_csize'], offset=record['warc_offset'], warc_path=record['warc_path'], @@ -166,11 +166,11 @@ class SandcrawlerFetchWorker(SandcrawlerWorker): elif record.get('item') and record.get('path'): # it's petabox link; fetch via HTTP start = time.time() - resp = requests.get("https://archive.org/serve/{}/{}".format( + ia_resp = requests.get("https://archive.org/serve/{}/{}".format( record['item'], record['path'])) petabox_sec = time.time() - start try: - resp.raise_for_status() + ia_resp.raise_for_status() except Exception as e: return dict( key=default_key, @@ -178,7 +178,7 @@ class SandcrawlerFetchWorker(SandcrawlerWorker): status="error-petabox", error_msg=str(e), ) - blob = resp.content + blob = ia_resp.content else: raise ValueError( "not a CDX (wayback) or petabox (archive.org) dict; not sure how to proceed") |