aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/ingest_file.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2022-03-22 15:57:12 -0700
committer Bryan Newbold <bnewbold@archive.org> 2022-03-22 16:01:29 -0700
commit fd6dc7f36aecb6a303513476825cfe681500f02d (patch)
tree b5d7dc242d5c5cef0df300f78379ceb13dd35c63 /python/sandcrawler/ingest_file.py
parent 257f41b174e04957aecf298b3ecdaae0ab44a1d2 (diff)
download sandcrawler-fd6dc7f36aecb6a303513476825cfe681500f02d.tar.gz
sandcrawler-fd6dc7f36aecb6a303513476825cfe681500f02d.zip
file ingest: don't 'backoff' on spn2 backoff error
The intent of this is to get through the daily ingest requests faster, so we can loop and retry if needed. A 200-second delay, which usually results in a Kafka topic reshuffle, really slows things down. This will presumably result in a bunch of requests ending with 'spn2-backoff' status, but we can just retry those.
Diffstat (limited to 'python/sandcrawler/ingest_file.py')
-rw-r--r-- python/sandcrawler/ingest_file.py 7
1 files changed, 7 insertions, 0 deletions
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index 857a212..4ec37c1 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -20,6 +20,7 @@ from sandcrawler.ia import (
NoCaptureError,
PetaboxError,
ResourceResult,
+ SavePageNowBackoffError,
SavePageNowClient,
SavePageNowError,
WaybackClient,
@@ -632,6 +633,12 @@ class IngestFileWorker(SandcrawlerWorker):
result["status"] = "spn2-error"
result["error_message"] = str(e)[:1600]
return result
+ except SavePageNowBackoffError as e:
+ result["status"] = "spn2-backoff"
+ result["error_message"] = str(e)[:1600]
+ # small sleep as a slow-down
+ time.sleep(2.0)
+ return result
except PetaboxError as e:
result["status"] = "petabox-error"
result["error_message"] = str(e)[:1600]