about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- python/sandcrawler/pdftrio.py | 32
1 files changed, 20 insertions, 12 deletions
diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py
index 168fb78..52d1b8d 100644
--- a/python/sandcrawler/pdftrio.py
+++ b/python/sandcrawler/pdftrio.py
@@ -90,10 +90,12 @@ class PdfTrioWorker(SandcrawlerWorker):
wayback_sec = time.time() - start
except (WaybackError, PetaboxError) as we:
return dict(
- status="error-wayback",
- error_msg=str(we),
- source=record,
key=default_key,
+ source=record,
+ pdf_trio=dict(
+ status="error-wayback",
+ error_msg=str(we),
+ ),
)
elif record.get('url') and record.get('datetime'):
# it's a partial CDX dict or something? fetch using WaybackClient
@@ -108,10 +110,12 @@ class PdfTrioWorker(SandcrawlerWorker):
wayback_sec = time.time() - start
except WaybackError as we:
return dict(
- status="error-wayback",
- error_msg=str(we),
- source=record,
key=default_key,
+ source=record,
+ pdf_trio=dict(
+ status="error-wayback",
+ error_msg=str(we),
+ ),
)
elif record.get('item') and record.get('path'):
# it's petabox link; fetch via HTTP
@@ -123,20 +127,24 @@ class PdfTrioWorker(SandcrawlerWorker):
resp.raise_for_status()
except Exception as e:
return dict(
- status="error-petabox",
- error_msg=str(e),
- source=record,
key=default_key,
+ source=record,
+ pdf_trio=dict(
+ status="error-petabox",
+ error_msg=str(e),
+ ),
)
blob = resp.content
else:
raise ValueError("not a CDX (wayback) or petabox (archive.org) dict; not sure how to proceed")
if not blob:
return dict(
- status="error",
- error_msg="empty blob",
- source=record,
key=default_key,
+ source=record,
+ pdf_trio=dict(
+ status="error",
+ error_msg="empty blob",
+ ),
)
result = dict()
result['file_meta'] = gen_file_metadata(blob)