aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2022-03-22 15:57:12 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2022-03-22 16:01:29 -0700
commit: fd6dc7f36aecb6a303513476825cfe681500f02d (patch)
tree: b5d7dc242d5c5cef0df300f78379ceb13dd35c63 /python
parent: 257f41b174e04957aecf298b3ecdaae0ab44a1d2 (diff)
download: sandcrawler-fd6dc7f36aecb6a303513476825cfe681500f02d.tar.gz
sandcrawler-fd6dc7f36aecb6a303513476825cfe681500f02d.zip
file ingest: don't 'backoff' on spn2 backoff error
The intent of this change is to get through the daily ingest requests faster, so that we can loop and retry if needed. A 200-second delay, which usually results in a Kafka topic reshuffle, really slows things down. This will presumably leave a number of requests with the spn2-backoff status, but we can simply retry those.
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/ia.py 1
-rw-r--r-- python/sandcrawler/ingest_file.py 7
2 files changed, 8 insertions, 0 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index d123c9f..641aa52 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -1057,6 +1057,7 @@ class SavePageNowClient:
job_id = resp_json["job_id"]
print(f" SPNv2 running: job_id={job_id} url={request_url}", file=sys.stderr)
+ time.sleep(0.1)
# poll until complete
final_json = None
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index 857a212..4ec37c1 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -20,6 +20,7 @@ from sandcrawler.ia import (
NoCaptureError,
PetaboxError,
ResourceResult,
+ SavePageNowBackoffError,
SavePageNowClient,
SavePageNowError,
WaybackClient,
@@ -632,6 +633,12 @@ class IngestFileWorker(SandcrawlerWorker):
result["status"] = "spn2-error"
result["error_message"] = str(e)[:1600]
return result
+ except SavePageNowBackoffError as e:
+ result["status"] = "spn2-backoff"
+ result["error_message"] = str(e)[:1600]
+ # small sleep as a slow-down
+ time.sleep(2.0)
+ return result
except PetaboxError as e:
result["status"] = "petabox-error"
result["error_message"] = str(e)[:1600]