aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/ia.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-03-18 18:49:05 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-03-18 18:49:09 -0700
commit: cb16d18137c936a634b75bf0eb6acb43c77d9290 (patch)
tree: 4b8b72aa7cd1d5a9da81c6233ea10b6cdc837d2a /python/sandcrawler/ia.py
parent: e1b3edd7af59fe0fd4272a4696387ea09a22a6c0 (diff)
download: sandcrawler-cb16d18137c936a634b75bf0eb6acb43c77d9290.tar.gz
sandcrawler-cb16d18137c936a634b75bf0eb6acb43c77d9290.zip
implement (unused) force_get flag for SPN2
I hoped this feature would make it possible to crawl journals.lww.com PDFs, because the token URLs work with `wget`, but it still doesn't seem to work. Maybe that is because of the user agent? Anyway, this feature might be useful for crawling efficiency, so I am adding it to master.
Diffstat (limited to 'python/sandcrawler/ia.py')
-rw-r--r-- python/sandcrawler/ia.py 7
1 files changed, 4 insertions, 3 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index e31ff30..0a0e0ae 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -758,7 +758,7 @@ class SavePageNowClient:
self.poll_count = 60
self.poll_seconds = 3.0
- def save_url_now_v2(self, request_url):
+ def save_url_now_v2(self, request_url, force_get=0):
"""
Returns a "SavePageNowResult" (namedtuple) if SPN request was processed
at all, or raises an exception if there was an error with SPN itself.
@@ -797,6 +797,7 @@ class SavePageNowClient:
'capture_all': 1,
'capture_screenshot': 0,
'if_not_archived_within': '1d',
+ 'force_get': force_get,
},
)
if resp.status_code == 429:
@@ -866,14 +867,14 @@ class SavePageNowClient:
None,
)
- def crawl_resource(self, start_url, wayback_client):
+ def crawl_resource(self, start_url, wayback_client, force_get=0):
"""
Runs a SPN2 crawl, then fetches body from wayback.
TODO: possible to fetch from petabox?
"""
- spn_result = self.save_url_now_v2(start_url)
+ spn_result = self.save_url_now_v2(start_url, force_get=force_get)
if not spn_result.success:
status = spn_result.status