diff options
author | Bryan Newbold <bnewbold@archive.org> | 2022-03-22 15:57:12 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2022-03-22 16:01:29 -0700 |
commit | fd6dc7f36aecb6a303513476825cfe681500f02d (patch) | |
tree | b5d7dc242d5c5cef0df300f78379ceb13dd35c63 | |
parent | 257f41b174e04957aecf298b3ecdaae0ab44a1d2 (diff) | |
download | sandcrawler-fd6dc7f36aecb6a303513476825cfe681500f02d.tar.gz sandcrawler-fd6dc7f36aecb6a303513476825cfe681500f02d.zip |
file ingest: don't 'backoff' on spn2 backoff error
The intent of this is to try to get through the daily ingest requests
faster, so we can loop and retry if needed. A 200-second delay, usually
resulting in a Kafka topic reshuffle, really slows things down. This
will presumably result in a bunch of requests with spn2-backoff status,
but we can just retry those.
-rw-r--r-- | python/sandcrawler/ia.py | 1 | ||||
-rw-r--r-- | python/sandcrawler/ingest_file.py | 7 |
2 files changed, 8 insertions, 0 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index d123c9f..641aa52 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -1057,6 +1057,7 @@ class SavePageNowClient: job_id = resp_json["job_id"] print(f" SPNv2 running: job_id={job_id} url={request_url}", file=sys.stderr) + time.sleep(0.1) # poll until complete final_json = None diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py index 857a212..4ec37c1 100644 --- a/python/sandcrawler/ingest_file.py +++ b/python/sandcrawler/ingest_file.py @@ -20,6 +20,7 @@ from sandcrawler.ia import ( NoCaptureError, PetaboxError, ResourceResult, + SavePageNowBackoffError, SavePageNowClient, SavePageNowError, WaybackClient, @@ -632,6 +633,12 @@ class IngestFileWorker(SandcrawlerWorker): result["status"] = "spn2-error" result["error_message"] = str(e)[:1600] return result + except SavePageNowBackoffError as e: + result["status"] = "spn2-backoff" + result["error_message"] = str(e)[:1600] + # small sleep as a slow-down + time.sleep(2.0) + return result except PetaboxError as e: result["status"] = "petabox-error" result["error_message"] = str(e)[:1600] |