about | summary | refs | log | tree | commit | diff | stats
path: root/python
diff options
context:
space:
mode:
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/grobid.py | 3
-rw-r--r-- python/sandcrawler/ia.py | 2
-rw-r--r-- python/sandcrawler/pdfextract.py | 3
-rw-r--r-- python/sandcrawler/pdftrio.py | 3
-rw-r--r-- python/sandcrawler/workers.py | 8
5 files changed, 11 insertions, 8 deletions
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index f4d778f..67aca17 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -113,7 +113,8 @@ class GrobidWorker(SandcrawlerFetchWorker):
fetch_result = self.fetch_blob(record)
if fetch_result['status'] != 'success':
return fetch_result
- blob = fetch_result['blob']
+ blob: bytes = fetch_result['blob']
+ assert blob and isinstance(blob, bytes)
result = self.grobid_client.process_fulltext(blob,
consolidate_mode=self.consolidate_mode)
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index b413bc8..8f28d42 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -507,7 +507,7 @@ class WaybackClient:
offset: int,
warc_path: str,
resolve_revisit: bool = True,
- expected_status_code: Optional[int] = None) -> WarcResource:
+ expected_status_code: Optional[int] = None) -> bytes:
"""
Fetches HTTP 200 WARC resource directly from petabox using WARC path/offset/csize.
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index d23d231..1d306d3 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -443,7 +443,8 @@ class PdfExtractWorker(SandcrawlerFetchWorker):
fetch_result = self.fetch_blob(record)
if fetch_result['status'] != 'success':
return fetch_result
- blob = fetch_result['blob']
+ blob: bytes = fetch_result['blob']
+ assert blob and isinstance(blob, bytes)
result = process_pdf(blob)
result.source = record
diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py
index 7d39f0f..138e65c 100644
--- a/python/sandcrawler/pdftrio.py
+++ b/python/sandcrawler/pdftrio.py
@@ -88,7 +88,8 @@ class PdfTrioWorker(SandcrawlerFetchWorker):
fetch_sec = time.time() - start
if fetch_result['status'] != 'success':
return fetch_result
- blob = fetch_result['blob']
+ blob: bytes = fetch_result['blob']
+ assert blob and isinstance(blob, bytes)
result = dict()
result['file_meta'] = gen_file_metadata(blob)
diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py
index ba0358f..ceb6671 100644
--- a/python/sandcrawler/workers.py
+++ b/python/sandcrawler/workers.py
@@ -132,7 +132,7 @@ class SandcrawlerFetchWorker(SandcrawlerWorker):
raise Exception("wayback client not configured for this SandcrawlerFetchWorker")
try:
start = time.time()
- blob = self.wayback_client.fetch_petabox_body(
+ blob: bytes = self.wayback_client.fetch_petabox_body(
csize=record['warc_csize'],
offset=record['warc_offset'],
warc_path=record['warc_path'],
@@ -166,11 +166,11 @@ class SandcrawlerFetchWorker(SandcrawlerWorker):
elif record.get('item') and record.get('path'):
# it's petabox link; fetch via HTTP
start = time.time()
- resp = requests.get("https://archive.org/serve/{}/{}".format(
+ ia_resp = requests.get("https://archive.org/serve/{}/{}".format(
record['item'], record['path']))
petabox_sec = time.time() - start
try:
- resp.raise_for_status()
+ ia_resp.raise_for_status()
except Exception as e:
return dict(
key=default_key,
@@ -178,7 +178,7 @@ class SandcrawlerFetchWorker(SandcrawlerWorker):
status="error-petabox",
error_msg=str(e),
)
- blob = resp.content
+ blob = ia_resp.content
else:
raise ValueError(
"not a CDX (wayback) or petabox (archive.org) dict; not sure how to proceed")