aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/workers.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-06-25 23:04:31 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-06-25 23:04:33 -0700
commit: 615914b13e5a67bcf749cdc682c86c83c1e7bacc (patch)
tree: 8ad87858ef525f8c6e08a797be12573abf68d77b /python/sandcrawler/workers.py
parent: 96513b5cc26dd91375a6433e49d24b2d9eb2bea9 (diff)
download: sandcrawler-615914b13e5a67bcf749cdc682c86c83c1e7bacc.tar.gz
sandcrawler-615914b13e5a67bcf749cdc682c86c83c1e7bacc.zip
don't nest generic fetch errors under pdf_trio
This came from sloppy refactoring (and missing test coverage)
Diffstat (limited to 'python/sandcrawler/workers.py')
-rw-r--r-- python/sandcrawler/workers.py 18
1 files changed, 6 insertions, 12 deletions
diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py
index 4a1d7a4..907bb5a 100644
--- a/python/sandcrawler/workers.py
+++ b/python/sandcrawler/workers.py
@@ -139,10 +139,8 @@ class SandcrawlerFetchWorker(SandcrawlerWorker):
return dict(
key=default_key,
source=record,
- pdf_trio=dict(
- status="error-wayback",
- error_msg=str(we),
- ),
+ status="error-wayback",
+ error_msg=str(we),
)
elif record.get('url') and record.get('datetime'):
# it's a partial CDX dict or something? fetch using WaybackClient
@@ -159,10 +157,8 @@ class SandcrawlerFetchWorker(SandcrawlerWorker):
return dict(
key=default_key,
source=record,
- pdf_trio=dict(
- status="error-wayback",
- error_msg=str(we),
- ),
+ status="error-wayback",
+ error_msg=str(we),
)
elif record.get('item') and record.get('path'):
# it's petabox link; fetch via HTTP
@@ -176,10 +172,8 @@ class SandcrawlerFetchWorker(SandcrawlerWorker):
return dict(
key=default_key,
source=record,
- pdf_trio=dict(
- status="error-petabox",
- error_msg=str(e),
- ),
+ status="error-petabox",
+ error_msg=str(e),
)
blob = resp.content
else: