diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-11-26 17:48:29 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-11-26 17:48:29 -0800 |
commit | 8670ba255643895f4b85ca7b17821832cc669784 (patch) | |
tree | c6d30062b84fcf3de782bf380fb82bd0b7cceab3 /python | |
parent | 9368136603513dde5757c4563d7c61e9f74f54cb (diff) | |
download | sandcrawler-8670ba255643895f4b85ca7b17821832cc669784.tar.gz sandcrawler-8670ba255643895f4b85ca7b17821832cc669784.zip |
catch more wayback error types
Diffstat (limited to 'python')
-rwxr-xr-x | python/kafka_grobid.py | 12 |
1 files changed, 11 insertions, 1 deletions
diff --git a/python/kafka_grobid.py b/python/kafka_grobid.py
index 4fac7d5..75ad52d 100755
--- a/python/kafka_grobid.py
+++ b/python/kafka_grobid.py
@@ -41,6 +41,7 @@ import requests
 import argparse
 import pykafka
 import wayback.exception
+from http.client import IncompleteRead
 from wayback.resource import Resource
 from wayback.resource import ArcResource
 from wayback.resourcestore import ResourceStore
@@ -114,6 +115,9 @@ class KafkaGrobidWorker:
         except EOFError as eofe:
             return None, dict(status="error",
                 reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
+        except TypeError as te:
+            return None, dict(status="error",
+                reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(eofe))
         # Note: could consider a generic "except Exception" here, as we get so
         # many petabox errors. Do want jobs to fail loud and clear when the
         # whole cluster is down though.
@@ -122,7 +126,13 @@ class KafkaGrobidWorker:
             return None, dict(status="error",
                 reason="archived HTTP response (WARC) was not 200",
                 warc_status=gwb_record.get_status()[0])
-        return gwb_record.open_raw_content().read(), None
+
+        try:
+            raw_content = gwb_record.open_raw_content().read()
+        except IncompleteRead as ire:
+            return None, dict(status="error",
+                reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire))
+        return raw_content, None
 
     def extract(self, info):
 