-rw-r--r--  python/sandcrawler/grobid.py     | 3
-rw-r--r--  python/sandcrawler/ia.py         | 2
-rw-r--r--  python/sandcrawler/pdfextract.py | 3
-rw-r--r--  python/sandcrawler/pdftrio.py    | 3
-rw-r--r--  python/sandcrawler/workers.py    | 8
5 files changed, 11 insertions, 8 deletions
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index f4d778f..67aca17 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -113,7 +113,8 @@ class GrobidWorker(SandcrawlerFetchWorker):
         fetch_result = self.fetch_blob(record)
         if fetch_result['status'] != 'success':
             return fetch_result
-        blob = fetch_result['blob']
+        blob: bytes = fetch_result['blob']
+        assert blob and isinstance(blob, bytes)
 
         result = self.grobid_client.process_fulltext(blob,
                                                      consolidate_mode=self.consolidate_mode)
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index b413bc8..8f28d42 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -507,7 +507,7 @@ class WaybackClient:
                            offset: int,
                            warc_path: str,
                            resolve_revisit: bool = True,
-                           expected_status_code: Optional[int] = None) -> WarcResource:
+                           expected_status_code: Optional[int] = None) -> bytes:
         """
         Fetches HTTP 200 WARC resource directly from petabox using WARC path/offset/csize.
 
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index d23d231..1d306d3 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -443,7 +443,8 @@ class PdfExtractWorker(SandcrawlerFetchWorker):
         fetch_result = self.fetch_blob(record)
         if fetch_result['status'] != 'success':
             return fetch_result
-        blob = fetch_result['blob']
+        blob: bytes = fetch_result['blob']
+        assert blob and isinstance(blob, bytes)
 
         result = process_pdf(blob)
         result.source = record
diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py
index 7d39f0f..138e65c 100644
--- a/python/sandcrawler/pdftrio.py
+++ b/python/sandcrawler/pdftrio.py
@@ -88,7 +88,8 @@ class PdfTrioWorker(SandcrawlerFetchWorker):
         fetch_sec = time.time() - start
         if fetch_result['status'] != 'success':
             return fetch_result
-        blob = fetch_result['blob']
+        blob: bytes = fetch_result['blob']
+        assert blob and isinstance(blob, bytes)
 
         result = dict()
         result['file_meta'] = gen_file_metadata(blob)
diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py
index ba0358f..ceb6671 100644
--- a/python/sandcrawler/workers.py
+++ b/python/sandcrawler/workers.py
@@ -132,7 +132,7 @@ class SandcrawlerFetchWorker(SandcrawlerWorker):
                 raise Exception("wayback client not configured for this SandcrawlerFetchWorker")
             try:
                 start = time.time()
-                blob = self.wayback_client.fetch_petabox_body(
+                blob: bytes = self.wayback_client.fetch_petabox_body(
                     csize=record['warc_csize'],
                     offset=record['warc_offset'],
                     warc_path=record['warc_path'],
@@ -166,11 +166,11 @@ class SandcrawlerFetchWorker(SandcrawlerWorker):
         elif record.get('item') and record.get('path'):
             # it's petabox link; fetch via HTTP
             start = time.time()
-            resp = requests.get("https://archive.org/serve/{}/{}".format(
+            ia_resp = requests.get("https://archive.org/serve/{}/{}".format(
                 record['item'], record['path']))
             petabox_sec = time.time() - start
             try:
-                resp.raise_for_status()
+                ia_resp.raise_for_status()
             except Exception as e:
                 return dict(
                     key=default_key,
@@ -178,7 +178,7 @@ class SandcrawlerFetchWorker(SandcrawlerWorker):
                     status="error-petabox",
                     error_msg=str(e),
                 )
-            blob = resp.content
+            blob = ia_resp.content
         else:
             raise ValueError(
                 "not a CDX (wayback) or petabox (archive.org) dict; not sure how to proceed")