aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-10-21 12:20:52 -0700
committerBryan Newbold <bnewbold@archive.org>2020-10-21 12:20:54 -0700
commit200bf734bd459dd3c7a147b3dfe127dbf0ed7f70 (patch)
tree4f010e66a059271ac3b9c496d15a3bc90bd763c4
parent33249f2679851afb64142c428be45d16f35f5539 (diff)
downloadsandcrawler-200bf734bd459dd3c7a147b3dfe127dbf0ed7f70.tar.gz
sandcrawler-200bf734bd459dd3c7a147b3dfe127dbf0ed7f70.zip
differentiate wayback-error from wayback-content-error
The motivation here is to distinguish errors caused by the content currently stored in wayback (e.g., in WARCs) from operational errors (e.g., the Wayback Machine is down, or network failures/disruption).
-rw-r--r--python/sandcrawler/__init__.py2
-rw-r--r--python/sandcrawler/grobid.py1
-rw-r--r--python/sandcrawler/ia.py23
-rw-r--r--python/sandcrawler/ingest.py6
-rw-r--r--python/sandcrawler/pdfextract.py1
-rw-r--r--python/sandcrawler/pdftrio.py1
-rw-r--r--python/sandcrawler/workers.py6
7 files changed, 22 insertions, 18 deletions
diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py
index 71c2023..e461462 100644
--- a/python/sandcrawler/__init__.py
+++ b/python/sandcrawler/__init__.py
@@ -3,7 +3,7 @@ from .grobid import GrobidClient, GrobidWorker, GrobidBlobWorker
from .pdftrio import PdfTrioClient, PdfTrioWorker, PdfTrioBlobWorker
from .misc import gen_file_metadata, b32_hex, parse_cdx_line, parse_cdx_datetime, clean_url
from .workers import KafkaSink, KafkaCompressSink, JsonLinePusher, CdxLinePusher, CdxLinePusher, KafkaJsonPusher, BlackholeSink, ZipfilePusher, MultiprocessWrapper
-from .ia import WaybackClient, WaybackError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError, PetaboxError, ResourceResult, WarcResource, CdxPartial, CdxRow
+from .ia import WaybackClient, WaybackError, WaybackContentError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError, PetaboxError, ResourceResult, WarcResource, CdxPartial, CdxRow
from .ingest import IngestFileWorker
from .persist import PersistCdxWorker, PersistIngestFileResultWorker, PersistGrobidWorker, PersistGrobidDiskWorker, PersistPdfTrioWorker, PersistIngestRequestWorker, PersistPdfTextWorker, PersistThumbnailWorker
from .db import SandcrawlerPostgrestClient, SandcrawlerPostgresClient
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 11623c5..b010b2c 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -4,7 +4,6 @@ import requests
from grobid2json import teixml2json
from .workers import SandcrawlerWorker, SandcrawlerFetchWorker
from .misc import gen_file_metadata
-from .ia import WaybackClient, WaybackError, PetaboxError
class GrobidClient(object):
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 59c53d0..fc51d91 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -296,6 +296,9 @@ class CdxApiClient:
class WaybackError(Exception):
pass
+class WaybackContentError(Exception):
+ pass
+
class PetaboxError(Exception):
pass
@@ -376,7 +379,7 @@ class WaybackClient:
try:
status_code = gwb_record.get_status()[0]
except http.client.HTTPException:
- raise WaybackError("too many HTTP headers (in wayback fetch)")
+ raise WaybackContentError("too many HTTP headers (in wayback fetch)")
location = gwb_record.get_location() or None
if status_code is None and gwb_record.target_uri.startswith(b"ftp://") and not gwb_record.is_revisit():
@@ -387,10 +390,10 @@ class WaybackClient:
revisit_cdx = None
if gwb_record.is_revisit():
if not resolve_revisit:
- raise WaybackError("found revisit record, but won't resolve (loop?)")
+ raise WaybackContentError("found revisit record, but won't resolve (loop?)")
revisit_uri, revisit_dt = gwb_record.refers_to
if not (revisit_uri and revisit_dt):
- raise WaybackError("revisit record missing URI and/or DT: warc:{} offset:{}".format(
+ raise WaybackContentError("revisit record missing URI and/or DT: warc:{} offset:{}".format(
warc_path, offset))
# convert revisit_dt
# len("2018-07-24T11:56:49"), or with "Z"
@@ -416,7 +419,7 @@ class WaybackClient:
raise WaybackError(
"failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire))
elif status_code is None:
- raise WaybackError(
+ raise WaybackContentError(
"got a None status_code in (W)ARC record")
return WarcResource(
status_code=status_code,
@@ -481,11 +484,11 @@ class WaybackClient:
headers=self.replay_headers,
)
except requests.exceptions.TooManyRedirects:
- raise WaybackError("redirect loop (wayback replay fetch)")
+ raise WaybackContentError("redirect loop (wayback replay fetch)")
except requests.exceptions.ChunkedEncodingError:
raise WaybackError("ChunkedEncodingError (wayback replay fetch)")
except UnicodeDecodeError:
- raise WaybackError("UnicodeDecodeError in replay request (can mean nasty redirect URL): {}".format(url))
+ raise WaybackContentError("UnicodeDecodeError in replay request (can mean nasty redirect URL): {}".format(url))
try:
resp.raise_for_status()
@@ -508,7 +511,7 @@ class WaybackClient:
cdx_sha1hex,
file_meta['sha1hex']),
file=sys.stderr)
- raise WaybackError("replay fetch body didn't match CDX hash cdx:{} body:{}".format(
+ raise WaybackContentError("replay fetch body didn't match CDX hash cdx:{} body:{}".format(
cdx_sha1hex,
file_meta['sha1hex']),
)
@@ -537,9 +540,9 @@ class WaybackClient:
headers=self.replay_headers,
)
except requests.exceptions.TooManyRedirects:
- raise WaybackError("redirect loop (wayback replay fetch)")
+ raise WaybackContentError("redirect loop (wayback replay fetch)")
except UnicodeDecodeError:
- raise WaybackError("UnicodeDecodeError in replay request (can mean nasty redirect URL): {}".format(url))
+ raise WaybackContentError("UnicodeDecodeError in replay request (can mean nasty redirect URL): {}".format(url))
try:
resp.raise_for_status()
except Exception as e:
@@ -1030,7 +1033,7 @@ class SavePageNowClient:
url=cdx_row.url,
datetime=cdx_row.datetime,
)
- except WaybackError as we:
+ except (WaybackError, WaybackContentError) as we:
return ResourceResult(
start_url=start_url,
hit=False,
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 322859a..57988e8 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -8,7 +8,7 @@ import requests
from http.server import BaseHTTPRequestHandler, HTTPServer
from collections import namedtuple
-from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult
+from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, WaybackContentError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult
from sandcrawler.grobid import GrobidClient
from sandcrawler.pdfextract import process_pdf, PdfExtractResult
from sandcrawler.misc import gen_file_metadata, clean_url
@@ -388,6 +388,10 @@ class IngestFileWorker(SandcrawlerWorker):
result['status'] = 'wayback-error'
result['error_message'] = str(e)[:1600]
return result
+ except WaybackContentError as e:
+ result['status'] = 'wayback-content-error'
+ result['error_message'] = str(e)[:1600]
+ return result
except NotImplementedError as e:
result['status'] = 'not-implemented'
result['error_message'] = str(e)[:1600]
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index d8a90c1..70d2f93 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -11,7 +11,6 @@ from PIL import Image
from .workers import SandcrawlerWorker, SandcrawlerFetchWorker
from .misc import gen_file_metadata
-from .ia import WaybackClient, WaybackError, PetaboxError
# This is a hack to work around timeouts when processing certain PDFs with
diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py
index c65b6c8..161dc9c 100644
--- a/python/sandcrawler/pdftrio.py
+++ b/python/sandcrawler/pdftrio.py
@@ -4,7 +4,6 @@ import requests
from .workers import SandcrawlerWorker, SandcrawlerFetchWorker
from .misc import gen_file_metadata, requests_retry_session
-from .ia import WaybackClient, WaybackError, PetaboxError
class PdfTrioClient(object):
diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py
index 814cbf3..37e3d7a 100644
--- a/python/sandcrawler/workers.py
+++ b/python/sandcrawler/workers.py
@@ -10,7 +10,7 @@ from collections import Counter
from confluent_kafka import Consumer, Producer, KafkaException
from .misc import parse_cdx_line
-from .ia import SandcrawlerBackoffError, WaybackError, PetaboxError
+from .ia import SandcrawlerBackoffError, WaybackError, WaybackContentError, PetaboxError
class SandcrawlerWorker(object):
@@ -135,7 +135,7 @@ class SandcrawlerFetchWorker(SandcrawlerWorker):
warc_path=record['warc_path'],
)
wayback_sec = time.time() - start
- except (WaybackError, PetaboxError, KeyError) as we:
+ except (WaybackError, WaybackContentError, PetaboxError, KeyError) as we:
return dict(
key=default_key,
source=record,
@@ -153,7 +153,7 @@ class SandcrawlerFetchWorker(SandcrawlerWorker):
datetime=record['datetime'],
)
wayback_sec = time.time() - start
- except WaybackError as we:
+ except (WaybackError, WaybackContentError) as we:
return dict(
key=default_key,
source=record,