diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-04-10 18:45:58 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-04-10 19:13:53 -0700 |
commit | 0a778bd6d46a71b7cbec04eb3a5bdb00d91da0de (patch) | |
tree | 2a2d4386acc27941daab1ad33b48f3226823a22e /mapreduce/extraction_cdx_grobid.py | |
parent | 41c79a8bd9b01ba52dc19e7e3ba13221bf23c560 (diff) | |
download | sandcrawler-0a778bd6d46a71b7cbec04eb3a5bdb00d91da0de.tar.gz sandcrawler-0a778bd6d46a71b7cbec04eb3a5bdb00d91da0de.zip |
wayback 404 test
Diffstat (limited to 'mapreduce/extraction_cdx_grobid.py')
-rwxr-xr-x | mapreduce/extraction_cdx_grobid.py | 3 |
1 file changed, 2 insertions, 1 deletion
diff --git a/mapreduce/extraction_cdx_grobid.py b/mapreduce/extraction_cdx_grobid.py
index 9a0d795..708e170 100755
--- a/mapreduce/extraction_cdx_grobid.py
+++ b/mapreduce/extraction_cdx_grobid.py
@@ -109,7 +109,8 @@ class MRExtractCdxGrobid(MRJob):
             rstore = ResourceStore(loaderfactory=CDXLoaderFactory())
             gwb_record = rstore.load_resource(warc_uri, offset, c_size)
         except wayback.exception.ResourceUnavailable as err:
-            return None, dict(status="petabox_error", reason="failed to load file contents")
+            return None, dict(status="error",
+                reason="failed to load file contents from wayback/petabox")
 
         if gwb_record.get_status()[0] != 200:
             return None, dict(status="error",