diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-09-17 14:29:53 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-09-17 14:29:53 -0700 |
commit | 4dc09571ca7824bcd8d39f8676fa6635081303b8 (patch) | |
tree | e5e8459434a4d687da9d216f5c2429fc413dfe90 | |
parent | 33a8f5b630ff52fcce10abfc272e2d8607ff591b (diff) | |
download | sandcrawler-4dc09571ca7824bcd8d39f8676fa6635081303b8.tar.gz sandcrawler-4dc09571ca7824bcd8d39f8676fa6635081303b8.zip |
more robust extraction code (against petabox failures)
-rwxr-xr-x | python/extraction_cdx_grobid.py | 11 | ||||
-rwxr-xr-x | python/extraction_ungrobided.py | 11 |
2 files changed, 20 insertions, 2 deletions
diff --git a/python/extraction_cdx_grobid.py b/python/extraction_cdx_grobid.py
index b27d053..76780b0 100755
--- a/python/extraction_cdx_grobid.py
+++ b/python/extraction_cdx_grobid.py
@@ -123,7 +123,16 @@ class MRExtractCdxGrobid(MRJob):
             gwb_record = rstore.load_resource(warc_uri, offset, c_size)
         except wayback.exception.ResourceUnavailable:
             return None, dict(status="error",
-                reason="failed to load file contents from wayback/petabox")
+                reason="failed to load file contents from wayback/petabox (ResourceUnavailable)")
+        except ValueError as ve:
+            return None, dict(status="error",
+                reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
+        except EOFError as eofe:
+            return None, dict(status="error",
+                reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
+        # Note: could consider a generic "except Exception" here, as we get so
+        # many petabox errors. Do want jobs to fail loud and clear when the
+        # whole cluster is down though.
 
         if gwb_record.get_status()[0] != 200:
             return None, dict(status="error",
diff --git a/python/extraction_ungrobided.py b/python/extraction_ungrobided.py
index 972b8f9..af38cea 100755
--- a/python/extraction_ungrobided.py
+++ b/python/extraction_ungrobided.py
@@ -126,7 +126,16 @@ class MRExtractUnGrobided(MRJob):
             gwb_record = rstore.load_resource(warc_uri, offset, c_size)
         except wayback.exception.ResourceUnavailable:
             return None, dict(status="error",
-                reason="failed to load file contents from wayback/petabox")
+                reason="failed to load file contents from wayback/petabox (ResourceUnavailable)")
+        except ValueError as ve:
+            return None, dict(status="error",
+                reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
+        except EOFError as eofe:
+            return None, dict(status="error",
+                reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
+        # Note: could consider a generic "except Exception" here, as we get so
+        # many petabox errors. Do want jobs to fail loud and clear when the
+        # whole cluster is down though.
 
         if gwb_record.get_status()[0] != 200:
             return None, dict(status="error",