diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-02-18 16:44:42 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-02-18 16:44:42 -0800 |
commit | 505a7253abb41d55ec0004e26cfdef033c9c8c74 (patch) | |
tree | abbc0158d6efef43e1a7fc4ead9f51a2c3019f85 | |
parent | 832a9e42bc068c1b1656526b4a2cb7108c9b8334 (diff) | |
download | sandcrawler-505a7253abb41d55ec0004e26cfdef033c9c8c74.tar.gz sandcrawler-505a7253abb41d55ec0004e26cfdef033c9c8c74.zip |
pdftrio: fix error nesting in pdftrio key
-rw-r--r-- | python/sandcrawler/pdftrio.py | 32 |
1 files changed, 20 insertions, 12 deletions
diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py
index 168fb78..52d1b8d 100644
--- a/python/sandcrawler/pdftrio.py
+++ b/python/sandcrawler/pdftrio.py
@@ -90,10 +90,12 @@ class PdfTrioWorker(SandcrawlerWorker):
                 wayback_sec = time.time() - start
             except (WaybackError, PetaboxError) as we:
                 return dict(
-                    status="error-wayback",
-                    error_msg=str(we),
-                    source=record,
                     key=default_key,
+                    source=record,
+                    pdf_trio=dict(
+                        status="error-wayback",
+                        error_msg=str(we),
+                    ),
                 )
         elif record.get('url') and record.get('datetime'):
             # it's a partial CDX dict or something? fetch using WaybackClient
@@ -108,10 +110,12 @@ class PdfTrioWorker(SandcrawlerWorker):
                 wayback_sec = time.time() - start
             except WaybackError as we:
                 return dict(
-                    status="error-wayback",
-                    error_msg=str(we),
-                    source=record,
                     key=default_key,
+                    source=record,
+                    pdf_trio=dict(
+                        status="error-wayback",
+                        error_msg=str(we),
+                    ),
                 )
         elif record.get('item') and record.get('path'):
             # it's petabox link; fetch via HTTP
@@ -123,20 +127,24 @@ class PdfTrioWorker(SandcrawlerWorker):
                 resp.raise_for_status()
             except Exception as e:
                 return dict(
-                    status="error-petabox",
-                    error_msg=str(e),
-                    source=record,
                     key=default_key,
+                    source=record,
+                    pdf_trio=dict(
+                        status="error-petabox",
+                        error_msg=str(e),
+                    ),
                 )
             blob = resp.content
         else:
             raise ValueError("not a CDX (wayback) or petabox (archive.org) dict; not sure how to proceed")
         if not blob:
             return dict(
-                status="error",
-                error_msg="empty blob",
-                source=record,
                 key=default_key,
+                source=record,
+                pdf_trio=dict(
+                    status="error",
+                    error_msg="empty blob",
+                ),
             )
         result = dict()
         result['file_meta'] = gen_file_metadata(blob)