From aabb14864abcff9057871904b3da848e5b14acd9 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Tue, 26 Oct 2021 18:31:17 -0700
Subject: fix type annotations for petabox body fetch helper

---
 python/sandcrawler/grobid.py     | 3 ++-
 python/sandcrawler/ia.py         | 2 +-
 python/sandcrawler/pdfextract.py | 3 ++-
 python/sandcrawler/pdftrio.py    | 3 ++-
 python/sandcrawler/workers.py    | 8 ++++----
 5 files changed, 11 insertions(+), 8 deletions(-)

(limited to 'python')

diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index f4d778f..67aca17 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -113,7 +113,8 @@ class GrobidWorker(SandcrawlerFetchWorker):
         fetch_result = self.fetch_blob(record)
         if fetch_result['status'] != 'success':
             return fetch_result
-        blob = fetch_result['blob']
+        blob: bytes = fetch_result['blob']
+        assert blob and isinstance(blob, bytes)
 
         result = self.grobid_client.process_fulltext(blob,
                                                      consolidate_mode=self.consolidate_mode)
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index b413bc8..8f28d42 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -507,7 +507,7 @@ class WaybackClient:
                            offset: int,
                            warc_path: str,
                            resolve_revisit: bool = True,
-                           expected_status_code: Optional[int] = None) -> WarcResource:
+                           expected_status_code: Optional[int] = None) -> bytes:
         """
         Fetches HTTP 200 WARC resource directly from petabox using WARC path/offset/csize.
 
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index d23d231..1d306d3 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -443,7 +443,8 @@ class PdfExtractWorker(SandcrawlerFetchWorker):
         fetch_result = self.fetch_blob(record)
         if fetch_result['status'] != 'success':
             return fetch_result
-        blob = fetch_result['blob']
+        blob: bytes = fetch_result['blob']
+        assert blob and isinstance(blob, bytes)
 
         result = process_pdf(blob)
         result.source = record
diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py
index 7d39f0f..138e65c 100644
--- a/python/sandcrawler/pdftrio.py
+++ b/python/sandcrawler/pdftrio.py
@@ -88,7 +88,8 @@ class PdfTrioWorker(SandcrawlerFetchWorker):
         fetch_sec = time.time() - start
         if fetch_result['status'] != 'success':
             return fetch_result
-        blob = fetch_result['blob']
+        blob: bytes = fetch_result['blob']
+        assert blob and isinstance(blob, bytes)
 
         result = dict()
         result['file_meta'] = gen_file_metadata(blob)
diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py
index ba0358f..ceb6671 100644
--- a/python/sandcrawler/workers.py
+++ b/python/sandcrawler/workers.py
@@ -132,7 +132,7 @@ class SandcrawlerFetchWorker(SandcrawlerWorker):
                 raise Exception("wayback client not configured for this SandcrawlerFetchWorker")
             try:
                 start = time.time()
-                blob = self.wayback_client.fetch_petabox_body(
+                blob: bytes = self.wayback_client.fetch_petabox_body(
                     csize=record['warc_csize'],
                     offset=record['warc_offset'],
                     warc_path=record['warc_path'],
@@ -166,11 +166,11 @@ class SandcrawlerFetchWorker(SandcrawlerWorker):
         elif record.get('item') and record.get('path'):
             # it's petabox link; fetch via HTTP
             start = time.time()
-            resp = requests.get("https://archive.org/serve/{}/{}".format(
+            ia_resp = requests.get("https://archive.org/serve/{}/{}".format(
                 record['item'], record['path']))
             petabox_sec = time.time() - start
             try:
-                resp.raise_for_status()
+                ia_resp.raise_for_status()
             except Exception as e:
                 return dict(
                     key=default_key,
@@ -178,7 +178,7 @@ class SandcrawlerFetchWorker(SandcrawlerWorker):
                     status="error-petabox",
                     error_msg=str(e),
                 )
-            blob = resp.content
+            blob = ia_resp.content
         else:
             raise ValueError(
                 "not a CDX (wayback) or petabox (archive.org) dict; not sure how to proceed")
-- 
cgit v1.2.3