From 8e26ab264190b998e9035f0883f00340ca220822 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 30 Sep 2021 17:17:30 -0700 Subject: tune SPN CDX retry/wait depending on mode (priority vs daily) --- python/sandcrawler/ia.py | 6 ++++-- python/sandcrawler/ingest.py | 2 +- python/sandcrawler_worker.py | 4 ++++ 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index 68b3466..a5d19cd 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -805,6 +805,8 @@ class SavePageNowClient: self.poll_count = 60 self.poll_seconds = 3.0 + self.spn_cdx_retry_sec = kwargs.get('spn_cdx_retry_sec', 9.0) + def save_url_now_v2(self, request_url, force_simple_get=0, capture_outlinks=0): """ Returns a "SavePageNowResult" (namedtuple) if SPN request was processed @@ -1023,7 +1025,7 @@ class SavePageNowClient: url=spn_result.terminal_url, datetime=spn_result.terminal_dt, filter_status_code=filter_status_code, - retry_sleep=9.0, + retry_sleep=self.spn_cdx_retry_sec, ) # sometimes there are fuzzy http/https self-redirects with the # same SURT; try to work around that @@ -1032,7 +1034,7 @@ class SavePageNowClient: url=spn_result.terminal_url, datetime=spn_result.terminal_dt, filter_status_code=200, - retry_sleep=9.0, + retry_sleep=self.spn_cdx_retry_sec, ) except KeyError as ke: print(" CDX KeyError: {}".format(ke), file=sys.stderr) diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index 1d33b94..ba478ea 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -64,7 +64,7 @@ class IngestFileWorker(SandcrawlerWorker): self.wayback_client = WaybackClient() self.spn_client = kwargs.get('spn_client') if not self.spn_client: - self.spn_client = SavePageNowClient() + self.spn_client = SavePageNowClient(spn_cdx_retry_sec=kwargs.get('spn_cdx_retry_sec', 9.0)) self.grobid_client = kwargs.get('grobid_client') if not self.grobid_client: self.grobid_client = GrobidClient() diff --git 
a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py index bd4ff67..8e275cf 100755 --- a/python/sandcrawler_worker.py +++ b/python/sandcrawler_worker.py @@ -201,13 +201,16 @@ def run_persist_pdftrio(args): pusher.run() def run_ingest_file(args): + spn_cdx_retry_sec = 9.0 if args.bulk: consume_group = "sandcrawler-{}-ingest-file-bulk".format(args.env) consume_topic = "sandcrawler-{}.ingest-file-requests-bulk".format(args.env) elif args.priority: + spn_cdx_retry_sec = 45.0 consume_group = "sandcrawler-{}-ingest-file-priority".format(args.env) consume_topic = "sandcrawler-{}.ingest-file-requests-priority".format(args.env) else: + spn_cdx_retry_sec = 1.0 consume_group = "sandcrawler-{}-ingest-file".format(args.env) consume_topic = "sandcrawler-{}.ingest-file-requests-daily".format(args.env) produce_topic = "sandcrawler-{}.ingest-file-results".format(args.env) @@ -253,6 +256,7 @@ def run_ingest_file(args): htmlteixml_sink=htmlteixml_sink, # don't SPNv2 for --bulk backfill try_spn2=not args.bulk, + spn_cdx_retry_sec=spn_cdx_retry_sec, ) pusher = KafkaJsonPusher( worker=worker, -- cgit v1.2.3