aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/ingest.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-10-21 12:20:52 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-10-21 12:20:54 -0700
commit 200bf734bd459dd3c7a147b3dfe127dbf0ed7f70 (patch)
tree 4f010e66a059271ac3b9c496d15a3bc90bd763c4 /python/sandcrawler/ingest.py
parent 33249f2679851afb64142c428be45d16f35f5539 (diff)
download sandcrawler-200bf734bd459dd3c7a147b3dfe127dbf0ed7f70.tar.gz
sandcrawler-200bf734bd459dd3c7a147b3dfe127dbf0ed7f70.zip
differentiate wayback-error from wayback-content-error
The motivation here is to distinguish errors due to current content in wayback (eg, in WARCs) from operational errors (eg, wayback machine is down, or network failures/disruption).
Diffstat (limited to 'python/sandcrawler/ingest.py')
-rw-r--r-- python/sandcrawler/ingest.py 6
1 file changed, 5 insertions, 1 deletion
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 322859a..57988e8 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -8,7 +8,7 @@ import requests
from http.server import BaseHTTPRequestHandler, HTTPServer
from collections import namedtuple
-from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult
+from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, WaybackContentError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult
from sandcrawler.grobid import GrobidClient
from sandcrawler.pdfextract import process_pdf, PdfExtractResult
from sandcrawler.misc import gen_file_metadata, clean_url
@@ -388,6 +388,10 @@ class IngestFileWorker(SandcrawlerWorker):
result['status'] = 'wayback-error'
result['error_message'] = str(e)[:1600]
return result
+ except WaybackContentError as e:
+ result['status'] = 'wayback-content-error'
+ result['error_message'] = str(e)[:1600]
+ return result
except NotImplementedError as e:
result['status'] = 'not-implemented'
result['error_message'] = str(e)[:1600]