about | summary | refs | log | tree | commit | diff | stats
path: root/python
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-02-18 16:44:42 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-02-18 16:44:42 -0800
commit: 505a7253abb41d55ec0004e26cfdef033c9c8c74 (patch)
tree: abbc0158d6efef43e1a7fc4ead9f51a2c3019f85 /python
parent: 832a9e42bc068c1b1656526b4a2cb7108c9b8334 (diff)
download: sandcrawler-505a7253abb41d55ec0004e26cfdef033c9c8c74.tar.gz
download: sandcrawler-505a7253abb41d55ec0004e26cfdef033c9c8c74.zip
pdftrio: fix error nesting in pdftrio key
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/pdftrio.py 32
1 files changed, 20 insertions, 12 deletions
diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py
index 168fb78..52d1b8d 100644
--- a/python/sandcrawler/pdftrio.py
+++ b/python/sandcrawler/pdftrio.py
@@ -90,10 +90,12 @@ class PdfTrioWorker(SandcrawlerWorker):
wayback_sec = time.time() - start
except (WaybackError, PetaboxError) as we:
return dict(
- status="error-wayback",
- error_msg=str(we),
- source=record,
key=default_key,
+ source=record,
+ pdf_trio=dict(
+ status="error-wayback",
+ error_msg=str(we),
+ ),
)
elif record.get('url') and record.get('datetime'):
# it's a partial CDX dict or something? fetch using WaybackClient
@@ -108,10 +110,12 @@ class PdfTrioWorker(SandcrawlerWorker):
wayback_sec = time.time() - start
except WaybackError as we:
return dict(
- status="error-wayback",
- error_msg=str(we),
- source=record,
key=default_key,
+ source=record,
+ pdf_trio=dict(
+ status="error-wayback",
+ error_msg=str(we),
+ ),
)
elif record.get('item') and record.get('path'):
# it's petabox link; fetch via HTTP
@@ -123,20 +127,24 @@ class PdfTrioWorker(SandcrawlerWorker):
resp.raise_for_status()
except Exception as e:
return dict(
- status="error-petabox",
- error_msg=str(e),
- source=record,
key=default_key,
+ source=record,
+ pdf_trio=dict(
+ status="error-petabox",
+ error_msg=str(e),
+ ),
)
blob = resp.content
else:
raise ValueError("not a CDX (wayback) or petabox (archive.org) dict; not sure how to proceed")
if not blob:
return dict(
- status="error",
- error_msg="empty blob",
- source=record,
key=default_key,
+ source=record,
+ pdf_trio=dict(
+ status="error",
+ error_msg="empty blob",
+ ),
)
result = dict()
result['file_meta'] = gen_file_metadata(blob)