From 5c7f9bc60b372006adac8e47ee2f4f1f73b84897 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 11 Aug 2020 17:07:07 -0700 Subject: refactor: force_get -> force_simple_get For clarity. The SPNv2 API hasn't changed, just changing the variable/parameter name. --- python/sandcrawler/ia.py | 10 +++++----- python/sandcrawler/ingest.py | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'python') diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index 150de53..7b623bc 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -769,7 +769,7 @@ class SavePageNowClient: self.poll_count = 60 self.poll_seconds = 3.0 - def save_url_now_v2(self, request_url, force_get=0, capture_outlinks=0): + def save_url_now_v2(self, request_url, force_simple_get=0, capture_outlinks=0): """ Returns a "SavePageNowResult" (namedtuple) if SPN request was processed at all, or raises an exception if there was an error with SPN itself. @@ -811,7 +811,7 @@ class SavePageNowClient: 'capture_outlinks': capture_outlinks, 'capture_screenshot': 0, 'if_not_archived_within': '1d', - 'force_get': force_get, + 'force_get': force_simple_get, 'skip_first_archive': 1, 'outlinks_availability': 0, 'js_behavior_timeout': 0, @@ -886,7 +886,7 @@ class SavePageNowClient: None, ) - def crawl_resource(self, start_url, wayback_client, force_get=0): + def crawl_resource(self, start_url, wayback_client, force_simple_get=0): """ Runs a SPN2 crawl, then fetches body from wayback. @@ -895,9 +895,9 @@ class SavePageNowClient: # HACK: capture CNKI domains with outlinks (for COVID-19 crawling) if 'gzbd.cnki.net/' in start_url: - spn_result = self.save_url_now_v2(start_url, force_get=force_get, capture_outlinks=1) + spn_result = self.save_url_now_v2(start_url, force_simple_get=force_simple_get, capture_outlinks=1) else: - spn_result = self.save_url_now_v2(start_url, force_get=force_get) + spn_result = self.save_url_now_v2(start_url, force_simple_get=force_simple_get) if not spn_result.success: status = spn_result.status diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index 58f3783..263b9d5 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -156,12 +156,12 @@ class IngestFileWorker(SandcrawlerWorker): if self.try_spn2 and (not resource or not resource.hit or soft404): via = "spn2" - force_get = 0 + force_simple_get = 0 for domain in self.spn2_simple_get_domains: if domain in url: - force_get = 1 + force_simple_get = 1 break - resource = self.spn_client.crawl_resource(url, self.wayback_client, force_get=force_get) + resource = self.spn_client.crawl_resource(url, self.wayback_client, force_simple_get=force_simple_get) print("[FETCH {}\t] {}\t{}".format( via, resource.status, -- cgit v1.2.3