diff options
| -rw-r--r-- | python/sandcrawler/__init__.py | 2 | ||||
| -rw-r--r-- | python/sandcrawler/ia.py | 12 | 
2 files changed, 11 insertions, 3 deletions
diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py index e8fbcdf..c9cc0c9 100644 --- a/python/sandcrawler/__init__.py +++ b/python/sandcrawler/__init__.py @@ -2,6 +2,6 @@  from .grobid import GrobidClient, GrobidWorker, GrobidBlobWorker  from .misc import gen_file_metadata, b32_hex, parse_cdx_line, parse_cdx_datetime  from .workers import KafkaSink, KafkaGrobidSink, JsonLinePusher, CdxLinePusher, CdxLinePusher, KafkaJsonPusher, BlackholeSink, ZipfilePusher, MultiprocessWrapper -from .ia import WaybackClient, WaybackError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError +from .ia import WaybackClient, WaybackError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError, SavePageNowRemoteError  from .ingest import IngestFileWorker diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index 455c9f6..489736e 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -126,6 +126,9 @@ class WaybackClient:  class SavePageNowError(Exception):      pass +class SavePageNowRemoteError(Exception): +    pass +  class SavePageNowClient:      def __init__(self, cdx_client=None, @@ -156,13 +159,18 @@ class SavePageNowClient:          error on non-success.          """          resp = self.http_session.get(self.v1endpoint + url) -        if resp.status_code != 200: +        if resp.status_code != 200 and not resp.headers.get('X-Archive-Orig-Location'): +            # looks like an error which was *not* a remote server error. Some +            # problem with wayback, might need to short-circuit              raise SavePageNowError("HTTP status: {}, url: {}".format(resp.status_code, url)) +        if resp.headers.get('X-Archive-Wayback-Runtime-Error'): +            # looks like a weird remote error; would not expect a CDX reply so bailing here +            raise SavePageNowRemoteError(resp.headers['X-Archive-Wayback-Runtime-Error'])          terminal_url = '/'.join(resp.url.split('/')[5:])          body = resp.content          cdx = self.cdx_client.lookup_latest(terminal_url)          if not cdx: -            raise SavePageNowError("SPN was successful, but CDX lookup then failed") +            raise SavePageNowError("SPN was successful, but CDX lookup then failed. URL: {}".format(terminal_url))          return (cdx, body)      def save_url_now_v2(self, url):  | 
