about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rwxr-xr-x python/extraction_cdx_grobid.py 11
-rwxr-xr-x python/extraction_ungrobided.py 11
2 files changed, 20 insertions, 2 deletions
diff --git a/python/extraction_cdx_grobid.py b/python/extraction_cdx_grobid.py
index b27d053..76780b0 100755
--- a/python/extraction_cdx_grobid.py
+++ b/python/extraction_cdx_grobid.py
@@ -123,7 +123,16 @@ class MRExtractCdxGrobid(MRJob):
gwb_record = rstore.load_resource(warc_uri, offset, c_size)
except wayback.exception.ResourceUnavailable:
return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox")
+ reason="failed to load file contents from wayback/petabox (ResourceUnavailable)")
+ except ValueError as ve:
+ return None, dict(status="error",
+ reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
+ except EOFError as eofe:
+ return None, dict(status="error",
+ reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
+ # Note: could consider a generic "except Exception" here, as we get so
+ # many petabox errors. Do want jobs to fail loud and clear when the
+ # whole cluster is down though.
if gwb_record.get_status()[0] != 200:
return None, dict(status="error",
diff --git a/python/extraction_ungrobided.py b/python/extraction_ungrobided.py
index 972b8f9..af38cea 100755
--- a/python/extraction_ungrobided.py
+++ b/python/extraction_ungrobided.py
@@ -126,7 +126,16 @@ class MRExtractUnGrobided(MRJob):
gwb_record = rstore.load_resource(warc_uri, offset, c_size)
except wayback.exception.ResourceUnavailable:
return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox")
+ reason="failed to load file contents from wayback/petabox (ResourceUnavailable)")
+ except ValueError as ve:
+ return None, dict(status="error",
+ reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
+ except EOFError as eofe:
+ return None, dict(status="error",
+ reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
+ # Note: could consider a generic "except Exception" here, as we get so
+ # many petabox errors. Do want jobs to fail loud and clear when the
+ # whole cluster is down though.
if gwb_record.get_status()[0] != 200:
return None, dict(status="error",