about summary refs log tree commit diff stats
path: root/python
diff options
context:
space:
mode:
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/__init__.py 2
-rw-r--r-- python/sandcrawler/ia.py 12
2 files changed, 11 insertions, 3 deletions
diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py
index e8fbcdf..c9cc0c9 100644
--- a/python/sandcrawler/__init__.py
+++ b/python/sandcrawler/__init__.py
@@ -2,6 +2,6 @@
from .grobid import GrobidClient, GrobidWorker, GrobidBlobWorker
from .misc import gen_file_metadata, b32_hex, parse_cdx_line, parse_cdx_datetime
from .workers import KafkaSink, KafkaGrobidSink, JsonLinePusher, CdxLinePusher, CdxLinePusher, KafkaJsonPusher, BlackholeSink, ZipfilePusher, MultiprocessWrapper
-from .ia import WaybackClient, WaybackError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError
+from .ia import WaybackClient, WaybackError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError, SavePageNowRemoteError
from .ingest import IngestFileWorker
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 455c9f6..489736e 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -126,6 +126,9 @@ class WaybackClient:
class SavePageNowError(Exception):
pass
+class SavePageNowRemoteError(Exception):
+ pass
+
class SavePageNowClient:
def __init__(self, cdx_client=None,
@@ -156,13 +159,18 @@ class SavePageNowClient:
error on non-success.
"""
resp = self.http_session.get(self.v1endpoint + url)
- if resp.status_code != 200:
+ if resp.status_code != 200 and not resp.headers.get('X-Archive-Orig-Location'):
+ # looks like an error which was *not* a remote server error. Some
+ # problem with wayback, might need to short-circuit
raise SavePageNowError("HTTP status: {}, url: {}".format(resp.status_code, url))
+ if resp.headers.get('X-Archive-Wayback-Runtime-Error'):
+ # looks like a weird remote error; would not expect a CDX reply so bailing here
+ raise SavePageNowRemoteError(resp.headers['X-Archive-Wayback-Runtime-Error'])
terminal_url = '/'.join(resp.url.split('/')[5:])
body = resp.content
cdx = self.cdx_client.lookup_latest(terminal_url)
if not cdx:
- raise SavePageNowError("SPN was successful, but CDX lookup then failed")
+ raise SavePageNowError("SPN was successful, but CDX lookup then failed. URL: {}".format(terminal_url))
return (cdx, body)
def save_url_now_v2(self, url):