From fd6dc7f36aecb6a303513476825cfe681500f02d Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 22 Mar 2022 15:57:12 -0700 Subject: file ingest: don't 'backoff' on spn2 backoff error The intent of this is to try to get through the daily ingest requests faster, so we can loop and retry if needed. A 200-second delay, usually resulting in a Kafka topic reshuffle, really slows things down. This will presumably result in a bunch of spn2-backoff status requests, but we can just retry those. --- python/sandcrawler/ia.py | 1 + python/sandcrawler/ingest_file.py | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index d123c9f..641aa52 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -1057,6 +1057,7 @@ class SavePageNowClient: job_id = resp_json["job_id"] print(f" SPNv2 running: job_id={job_id} url={request_url}", file=sys.stderr) + time.sleep(0.1) # poll until complete final_json = None diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py index 857a212..4ec37c1 100644 --- a/python/sandcrawler/ingest_file.py +++ b/python/sandcrawler/ingest_file.py @@ -20,6 +20,7 @@ from sandcrawler.ia import ( NoCaptureError, PetaboxError, ResourceResult, + SavePageNowBackoffError, SavePageNowClient, SavePageNowError, WaybackClient, @@ -632,6 +633,12 @@ class IngestFileWorker(SandcrawlerWorker): result["status"] = "spn2-error" result["error_message"] = str(e)[:1600] return result + except SavePageNowBackoffError as e: + result["status"] = "spn2-backoff" + result["error_message"] = str(e)[:1600] + # small sleep as a slow-down + time.sleep(2.0) + return result except PetaboxError as e: result["status"] = "petabox-error" result["error_message"] = str(e)[:1600] -- cgit v1.2.3