From 4dc09571ca7824bcd8d39f8676fa6635081303b8 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 17 Sep 2018 14:29:53 -0700 Subject: more robust extraction code (against petabox failures) --- python/extraction_cdx_grobid.py | 11 ++++++++++- python/extraction_ungrobided.py | 11 ++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/python/extraction_cdx_grobid.py b/python/extraction_cdx_grobid.py index b27d053..76780b0 100755 --- a/python/extraction_cdx_grobid.py +++ b/python/extraction_cdx_grobid.py @@ -123,7 +123,16 @@ class MRExtractCdxGrobid(MRJob): gwb_record = rstore.load_resource(warc_uri, offset, c_size) except wayback.exception.ResourceUnavailable: return None, dict(status="error", - reason="failed to load file contents from wayback/petabox") + reason="failed to load file contents from wayback/petabox (ResourceUnavailable)") + except ValueError as ve: + return None, dict(status="error", + reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve)) + except EOFError as eofe: + return None, dict(status="error", + reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe)) + # Note: could consider a generic "except Exception" here, as we get so + # many petabox errors. Do want jobs to fail loud and clear when the + # whole cluster is down though. if gwb_record.get_status()[0] != 200: return None, dict(status="error", diff --git a/python/extraction_ungrobided.py b/python/extraction_ungrobided.py index 972b8f9..af38cea 100755 --- a/python/extraction_ungrobided.py +++ b/python/extraction_ungrobided.py @@ -126,7 +126,16 @@ class MRExtractUnGrobided(MRJob): gwb_record = rstore.load_resource(warc_uri, offset, c_size) except wayback.exception.ResourceUnavailable: return None, dict(status="error", - reason="failed to load file contents from wayback/petabox") + reason="failed to load file contents from wayback/petabox (ResourceUnavailable)") + except ValueError as ve: + return None, dict(status="error", + reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve)) + except EOFError as eofe: + return None, dict(status="error", + reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe)) + # Note: could consider a generic "except Exception" here, as we get so + # many petabox errors. Do want jobs to fail loud and clear when the + # whole cluster is down though. if gwb_record.get_status()[0] != 200: return None, dict(status="error", -- cgit v1.2.3