aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2022-04-26 15:25:20 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2022-04-26 15:25:23 -0700
commit8f9240fed272367669b535b1334e280c588a1791 (patch)
treef2eaf2c0ff81290b482dba31ef1df650dac3b23d
parentf8f41070adfd4dc4856d96f618ee96fe8c411458 (diff)
downloadsandcrawler-8f9240fed272367669b535b1334e280c588a1791.tar.gz
sandcrawler-8f9240fed272367669b535b1334e280c588a1791.zip
SPNv2: several fixes for prod throughput
Most importantly, for some API flags, if the value is not truthy, do not set the flag at all. Setting any of these flags, even to 0/false, was resulting in screenshots and outlinks actually getting created/captured, which was a huge slowdown. Also, check the per-user SPNv2 slots available, using the API, before requesting an actual capture.
-rw-r--r--python/sandcrawler/ia.py45
1 files changed, 34 insertions, 11 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 641aa52..9c727ce 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -1010,19 +1010,41 @@ class SavePageNowClient:
if domain in request_url:
force_simple_get = 1
break
+
+ # check if SPNv2 user has capacity available
+ resp = self.v2_session.get("https://web.archive.org/save/status/user")
+ if resp.status_code == 429:
+ raise SavePageNowBackoffError(
+ f"SPNv2 availability API status_code: {resp.status_code}"
+ )
+ elif resp.status_code != 200:
+ raise SavePageNowError(f"SPN2 availability status_code: {resp.status_code}")
+ resp.raise_for_status()
+ status_user = resp.json()
+ if status_user["available"] <= 1:
+ print(f"SPNv2 user slots not available: {resp.text}", file=sys.stderr)
+ raise SavePageNowBackoffError(
+ "SPNv2 availability: {}, url: {}".format(status_user, request_url)
+ )
+
+ req_data = {
+ "url": request_url,
+ "capture_all": 1,
+ "if_not_archived_within": "1d",
+ "skip_first_archive": 1,
+ "js_behavior_timeout": 0,
+ # NOTE: not set explicitly to 0/false because of a bug in SPNv2 API
+ # implementation
+ # "capture_screenshot": 0,
+ # "outlinks_availability": 0,
+ }
+ if force_simple_get:
+ req_data["force_get"] = force_simple_get
+ if capture_outlinks:
+ req_data["capture_outlinks"] = capture_outlinks
resp = self.v2_session.post(
self.v2endpoint,
- data={
- "url": request_url,
- "capture_all": 1,
- "capture_outlinks": capture_outlinks,
- "capture_screenshot": 0,
- "if_not_archived_within": "1d",
- "force_get": force_simple_get,
- "skip_first_archive": 1,
- "outlinks_availability": 0,
- "js_behavior_timeout": 0,
- },
+ data=req_data,
)
if resp.status_code == 429:
raise SavePageNowBackoffError(
@@ -1032,6 +1054,7 @@ class SavePageNowClient:
raise SavePageNowError(
"SPN2 status_code: {}, url: {}".format(resp.status_code, request_url)
)
+ resp.raise_for_status()
resp_json = resp.json()
if (