aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-08-11 17:07:07 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-08-11 17:07:09 -0700
commit: 5c7f9bc60b372006adac8e47ee2f4f1f73b84897 (patch)
tree: 521d771057a71b40b4ebb8a85a16da2c568f4269 /python
parent: 33cc50939619d1c30bdfa800aba2137397a7ee0d (diff)
download: sandcrawler-5c7f9bc60b372006adac8e47ee2f4f1f73b84897.tar.gz
download: sandcrawler-5c7f9bc60b372006adac8e47ee2f4f1f73b84897.zip
refactor: force_get -> force_simple_get
Renamed for clarity. The SPNv2 API itself hasn't changed (the request field sent to it is still 'force_get'); only the Python variable/parameter name changes.
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/ia.py 10
-rw-r--r-- python/sandcrawler/ingest.py 6
2 files changed, 8 insertions, 8 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 150de53..7b623bc 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -769,7 +769,7 @@ class SavePageNowClient:
self.poll_count = 60
self.poll_seconds = 3.0
- def save_url_now_v2(self, request_url, force_get=0, capture_outlinks=0):
+ def save_url_now_v2(self, request_url, force_simple_get=0, capture_outlinks=0):
"""
Returns a "SavePageNowResult" (namedtuple) if SPN request was processed
at all, or raises an exception if there was an error with SPN itself.
@@ -811,7 +811,7 @@ class SavePageNowClient:
'capture_outlinks': capture_outlinks,
'capture_screenshot': 0,
'if_not_archived_within': '1d',
- 'force_get': force_get,
+ 'force_get': force_simple_get,
'skip_first_archive': 1,
'outlinks_availability': 0,
'js_behavior_timeout': 0,
@@ -886,7 +886,7 @@ class SavePageNowClient:
None,
)
- def crawl_resource(self, start_url, wayback_client, force_get=0):
+ def crawl_resource(self, start_url, wayback_client, force_simple_get=0):
"""
Runs a SPN2 crawl, then fetches body from wayback.
@@ -895,9 +895,9 @@ class SavePageNowClient:
# HACK: capture CNKI domains with outlinks (for COVID-19 crawling)
if 'gzbd.cnki.net/' in start_url:
- spn_result = self.save_url_now_v2(start_url, force_get=force_get, capture_outlinks=1)
+ spn_result = self.save_url_now_v2(start_url, force_simple_get=force_simple_get, capture_outlinks=1)
else:
- spn_result = self.save_url_now_v2(start_url, force_get=force_get)
+ spn_result = self.save_url_now_v2(start_url, force_simple_get=force_simple_get)
if not spn_result.success:
status = spn_result.status
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 58f3783..263b9d5 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -156,12 +156,12 @@ class IngestFileWorker(SandcrawlerWorker):
if self.try_spn2 and (not resource or not resource.hit or soft404):
via = "spn2"
- force_get = 0
+ force_simple_get = 0
for domain in self.spn2_simple_get_domains:
if domain in url:
- force_get = 1
+ force_simple_get = 1
break
- resource = self.spn_client.crawl_resource(url, self.wayback_client, force_get=force_get)
+ resource = self.spn_client.crawl_resource(url, self.wayback_client, force_simple_get=force_simple_get)
print("[FETCH {}\t] {}\t{}".format(
via,
resource.status,