author     Bryan Newbold <bnewbold@archive.org>  2020-10-21 12:20:52 -0700
committer  Bryan Newbold <bnewbold@archive.org>  2020-10-21 12:20:54 -0700
commit     200bf734bd459dd3c7a147b3dfe127dbf0ed7f70 (patch)
tree       4f010e66a059271ac3b9c496d15a3bc90bd763c4
parent     33249f2679851afb64142c428be45d16f35f5539 (diff)
download   sandcrawler-200bf734bd459dd3c7a147b3dfe127dbf0ed7f70.tar.gz
           sandcrawler-200bf734bd459dd3c7a147b3dfe127dbf0ed7f70.zip
differentiate wayback-error from wayback-content-error
The motivation here is to distinguish errors caused by the content stored in
wayback (eg, in WARCs) from operational errors (eg, the wayback machine being
down, or network failures/disruption).
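
For illustration, a minimal sketch of the intended split (the exception docstrings and the retry wrapper below are assumptions for this example, not code from this commit): operational WaybackError failures are transient and worth retrying, while WaybackContentError failures are permanent for a given capture.

    import time

    class WaybackError(Exception):
        """Operational error: wayback/petabox down, network disruption."""

    class WaybackContentError(Exception):
        """Problem with the stored content itself (eg, a bad WARC record)."""

    def fetch_with_retry(fetch, retries=3):
        # `fetch` is any zero-argument callable that hits wayback; a
        # hypothetical helper, not part of sandcrawler. Retry operational
        # errors with backoff; content errors will fail identically every
        # time, so re-raise them immediately.
        for attempt in range(retries):
            try:
                return fetch()
            except WaybackContentError:
                raise
            except WaybackError:
                if attempt + 1 == retries:
                    raise
                time.sleep(2 ** attempt)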
-rw-r--r--  python/sandcrawler/__init__.py   |  2
-rw-r--r--  python/sandcrawler/grobid.py     |  1
-rw-r--r--  python/sandcrawler/ia.py         | 23
-rw-r--r--  python/sandcrawler/ingest.py     |  6
-rw-r--r--  python/sandcrawler/pdfextract.py |  1
-rw-r--r--  python/sandcrawler/pdftrio.py    |  1
-rw-r--r--  python/sandcrawler/workers.py    |  6
7 files changed, 22 insertions, 18 deletions
diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py
index 71c2023..e461462 100644
--- a/python/sandcrawler/__init__.py
+++ b/python/sandcrawler/__init__.py
@@ -3,7 +3,7 @@ from .grobid import GrobidClient, GrobidWorker, GrobidBlobWorker
 from .pdftrio import PdfTrioClient, PdfTrioWorker, PdfTrioBlobWorker
 from .misc import gen_file_metadata, b32_hex, parse_cdx_line, parse_cdx_datetime, clean_url
 from .workers import KafkaSink, KafkaCompressSink, JsonLinePusher, CdxLinePusher, CdxLinePusher, KafkaJsonPusher, BlackholeSink, ZipfilePusher, MultiprocessWrapper
-from .ia import WaybackClient, WaybackError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError, PetaboxError, ResourceResult, WarcResource, CdxPartial, CdxRow
+from .ia import WaybackClient, WaybackError, WaybackContentError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError, PetaboxError, ResourceResult, WarcResource, CdxPartial, CdxRow
 from .ingest import IngestFileWorker
 from .persist import PersistCdxWorker, PersistIngestFileResultWorker, PersistGrobidWorker, PersistGrobidDiskWorker, PersistPdfTrioWorker, PersistIngestRequestWorker, PersistPdfTextWorker, PersistThumbnailWorker
 from .db import SandcrawlerPostgrestClient, SandcrawlerPostgresClient
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 11623c5..b010b2c 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -4,7 +4,6 @@ import requests
 from grobid2json import teixml2json
 from .workers import SandcrawlerWorker, SandcrawlerFetchWorker
 from .misc import gen_file_metadata
-from .ia import WaybackClient, WaybackError, PetaboxError
 
 
 class GrobidClient(object):
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 59c53d0..fc51d91 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -296,6 +296,9 @@ class CdxApiClient:
 class WaybackError(Exception):
     pass
 
+class WaybackContentError(Exception):
+    pass
+
 class PetaboxError(Exception):
     pass
 
@@ -376,7 +379,7 @@ class WaybackClient:
         try:
             status_code = gwb_record.get_status()[0]
         except http.client.HTTPException:
-            raise WaybackError("too many HTTP headers (in wayback fetch)")
+            raise WaybackContentError("too many HTTP headers (in wayback fetch)")
         location = gwb_record.get_location() or None
 
         if status_code is None and gwb_record.target_uri.startswith(b"ftp://") and not gwb_record.is_revisit():
@@ -387,10 +390,10 @@
         revisit_cdx = None
         if gwb_record.is_revisit():
             if not resolve_revisit:
-                raise WaybackError("found revisit record, but won't resolve (loop?)")
+                raise WaybackContentError("found revisit record, but won't resolve (loop?)")
             revisit_uri, revisit_dt = gwb_record.refers_to
             if not (revisit_uri and revisit_dt):
-                raise WaybackError("revisit record missing URI and/or DT: warc:{} offset:{}".format(
+                raise WaybackContentError("revisit record missing URI and/or DT: warc:{} offset:{}".format(
                     warc_path, offset))
             # convert revisit_dt
             # len("2018-07-24T11:56:49"), or with "Z"
@@ -416,7 +419,7 @@
             raise WaybackError(
                 "failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire))
         elif status_code is None:
-            raise WaybackError(
+            raise WaybackContentError(
                 "got a None status_code in (W)ARC record")
         return WarcResource(
             status_code=status_code,
@@ -481,11 +484,11 @@
                 headers=self.replay_headers,
             )
         except requests.exceptions.TooManyRedirects:
-            raise WaybackError("redirect loop (wayback replay fetch)")
+            raise WaybackContentError("redirect loop (wayback replay fetch)")
         except requests.exceptions.ChunkedEncodingError:
             raise WaybackError("ChunkedEncodingError (wayback replay fetch)")
         except UnicodeDecodeError:
-            raise WaybackError("UnicodeDecodeError in replay request (can mean nasty redirect URL): {}".format(url))
+            raise WaybackContentError("UnicodeDecodeError in replay request (can mean nasty redirect URL): {}".format(url))
 
         try:
             resp.raise_for_status()
@@ -508,7 +511,7 @@
                     cdx_sha1hex,
                     file_meta['sha1hex']),
                     file=sys.stderr)
-            raise WaybackError("replay fetch body didn't match CDX hash cdx:{} body:{}".format(
+            raise WaybackContentError("replay fetch body didn't match CDX hash cdx:{} body:{}".format(
                 cdx_sha1hex,
                 file_meta['sha1hex']),
             )
@@ -537,9 +540,9 @@
                 headers=self.replay_headers,
             )
         except requests.exceptions.TooManyRedirects:
-            raise WaybackError("redirect loop (wayback replay fetch)")
+            raise WaybackContentError("redirect loop (wayback replay fetch)")
         except UnicodeDecodeError:
-            raise WaybackError("UnicodeDecodeError in replay request (can mean nasty redirect URL): {}".format(url))
+            raise WaybackContentError("UnicodeDecodeError in replay request (can mean nasty redirect URL): {}".format(url))
         try:
             resp.raise_for_status()
         except Exception as e:
@@ -1030,7 +1033,7 @@
                 url=cdx_row.url,
                 datetime=cdx_row.datetime,
             )
-        except WaybackError as we:
+        except (WaybackError, WaybackContentError) as we:
             return ResourceResult(
                 start_url=start_url,
                 hit=False,
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 322859a..57988e8 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -8,7 +8,7 @@ import requests
 from http.server import BaseHTTPRequestHandler, HTTPServer
 from collections import namedtuple
 
-from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult
+from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, WaybackContentError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult
 from sandcrawler.grobid import GrobidClient
 from sandcrawler.pdfextract import process_pdf, PdfExtractResult
 from sandcrawler.misc import gen_file_metadata, clean_url
@@ -388,6 +388,10 @@
             result['status'] = 'wayback-error'
             result['error_message'] = str(e)[:1600]
             return result
+        except WaybackContentError as e:
+            result['status'] = 'wayback-content-error'
+            result['error_message'] = str(e)[:1600]
+            return result
         except NotImplementedError as e:
             result['status'] = 'not-implemented'
             result['error_message'] = str(e)[:1600]
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index d8a90c1..70d2f93 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -11,7 +11,6 @@ from PIL import Image
 
 from .workers import SandcrawlerWorker, SandcrawlerFetchWorker
 from .misc import gen_file_metadata
-from .ia import WaybackClient, WaybackError, PetaboxError
 
 
 # This is a hack to work around timeouts when processing certain PDFs with
diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py
index c65b6c8..161dc9c 100644
--- a/python/sandcrawler/pdftrio.py
+++ b/python/sandcrawler/pdftrio.py
@@ -4,7 +4,6 @@ import requests
 
 from .workers import SandcrawlerWorker, SandcrawlerFetchWorker
 from .misc import gen_file_metadata, requests_retry_session
-from .ia import WaybackClient, WaybackError, PetaboxError
 
 
 class PdfTrioClient(object):
diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py
index 814cbf3..37e3d7a 100644
--- a/python/sandcrawler/workers.py
+++ b/python/sandcrawler/workers.py
@@ -10,7 +10,7 @@ from collections import Counter
 from confluent_kafka import Consumer, Producer, KafkaException
 
 from .misc import parse_cdx_line
-from .ia import SandcrawlerBackoffError, WaybackError, PetaboxError
+from .ia import SandcrawlerBackoffError, WaybackError, WaybackContentError, PetaboxError
 
 
 class SandcrawlerWorker(object):
@@ -135,7 +135,7 @@ class SandcrawlerFetchWorker(SandcrawlerWorker):
                 warc_path=record['warc_path'],
             )
             wayback_sec = time.time() - start
-        except (WaybackError, PetaboxError, KeyError) as we:
+        except (WaybackError, WaybackContentError, PetaboxError, KeyError) as we:
             return dict(
                 key=default_key,
                 source=record,
@@ -153,7 +153,7 @@ class SandcrawlerFetchWorker(SandcrawlerWorker):
                 datetime=record['datetime'],
            )
             wayback_sec = time.time() - start
-        except WaybackError as we:
+        except (WaybackError, WaybackContentError) as we:
             return dict(
                 key=default_key,
                 source=record,
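
Downstream, the new exception surfaces as its own ingest status ('wayback-content-error' alongside the existing 'wayback-error'), so consumers of ingest results can tell the two cases apart. A rough sketch of that distinction from a consumer's point of view (the helper below is hypothetical; only the status strings come from this commit):

    def should_reenqueue(result: dict) -> bool:
        # 'wayback-error' now means an operational problem (service down,
        # network disruption): safe to re-enqueue and retry later.
        # 'wayback-content-error' means the stored capture itself is bad:
        # retrying the same capture will not help.
        return result.get('status') == 'wayback-error'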