about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2018-09-17 14:29:53 -0700
committer Bryan Newbold <bnewbold@archive.org> 2018-09-17 14:29:53 -0700
commit 4dc09571ca7824bcd8d39f8676fa6635081303b8 (patch)
tree e5e8459434a4d687da9d216f5c2429fc413dfe90
parent 33a8f5b630ff52fcce10abfc272e2d8607ff591b (diff)
download sandcrawler-4dc09571ca7824bcd8d39f8676fa6635081303b8.tar.gz
sandcrawler-4dc09571ca7824bcd8d39f8676fa6635081303b8.zip
more robust extraction code (against petabox failures)
-rwxr-xr-x python/extraction_cdx_grobid.py 11
-rwxr-xr-x python/extraction_ungrobided.py 11
2 files changed, 20 insertions, 2 deletions
diff --git a/python/extraction_cdx_grobid.py b/python/extraction_cdx_grobid.py
index b27d053..76780b0 100755
--- a/python/extraction_cdx_grobid.py
+++ b/python/extraction_cdx_grobid.py
@@ -123,7 +123,16 @@ class MRExtractCdxGrobid(MRJob):
gwb_record = rstore.load_resource(warc_uri, offset, c_size)
except wayback.exception.ResourceUnavailable:
return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox")
+ reason="failed to load file contents from wayback/petabox (ResourceUnavailable)")
+ except ValueError as ve:
+ return None, dict(status="error",
+ reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
+ except EOFError as eofe:
+ return None, dict(status="error",
+ reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
+ # Note: could consider a generic "except Exception" here, as we get so
+ # many petabox errors. Do want jobs to fail loud and clear when the
+ # whole cluster is down though.
if gwb_record.get_status()[0] != 200:
return None, dict(status="error",
diff --git a/python/extraction_ungrobided.py b/python/extraction_ungrobided.py
index 972b8f9..af38cea 100755
--- a/python/extraction_ungrobided.py
+++ b/python/extraction_ungrobided.py
@@ -126,7 +126,16 @@ class MRExtractUnGrobided(MRJob):
gwb_record = rstore.load_resource(warc_uri, offset, c_size)
except wayback.exception.ResourceUnavailable:
return None, dict(status="error",
- reason="failed to load file contents from wayback/petabox")
+ reason="failed to load file contents from wayback/petabox (ResourceUnavailable)")
+ except ValueError as ve:
+ return None, dict(status="error",
+ reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
+ except EOFError as eofe:
+ return None, dict(status="error",
+ reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
+ # Note: could consider a generic "except Exception" here, as we get so
+ # many petabox errors. Do want jobs to fail loud and clear when the
+ # whole cluster is down though.
if gwb_record.get_status()[0] != 200:
return None, dict(status="error",