diff options
author | Bryan Newbold <bnewbold@archive.org> | 2022-04-26 15:25:20 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2022-04-26 15:25:23 -0700 |
commit | 8f9240fed272367669b535b1334e280c588a1791 (patch) | |
tree | f2eaf2c0ff81290b482dba31ef1df650dac3b23d | |
parent | f8f41070adfd4dc4856d96f618ee96fe8c411458 (diff) | |
download | sandcrawler-8f9240fed272367669b535b1334e280c588a1791.tar.gz sandcrawler-8f9240fed272367669b535b1334e280c588a1791.zip |
SPNv2: several fixes for prod throughput
Most importantly, for some API flags, if the value is not truthy, do
not set the flag at all. Setting a flag to any value (even 0/false) was
resulting in screenshots and outlinks actually getting created/captured,
which was a huge slowdown.
Also, use the API to check that per-user SPNv2 slots are available before
requesting an actual capture.
-rw-r--r-- | python/sandcrawler/ia.py | 45 |
1 files changed, 34 insertions, 11 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index 641aa52..9c727ce 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -1010,19 +1010,41 @@ class SavePageNowClient: if domain in request_url: force_simple_get = 1 break + + # check if SPNv2 user has capacity available + resp = self.v2_session.get("https://web.archive.org/save/status/user") + if resp.status_code == 429: + raise SavePageNowBackoffError( + f"SPNv2 availability API status_code: {resp.status_code}" + ) + elif resp.status_code != 200: + raise SavePageNowError(f"SPN2 availability status_code: {resp.status_code}") + resp.raise_for_status() + status_user = resp.json() + if status_user["available"] <= 1: + print(f"SPNv2 user slots not available: {resp.text}", file=sys.stderr) + raise SavePageNowBackoffError( + "SPNv2 availability: {}, url: {}".format(status_user, request_url) + ) + + req_data = { + "url": request_url, + "capture_all": 1, + "if_not_archived_within": "1d", + "skip_first_archive": 1, + "js_behavior_timeout": 0, + # NOTE: not set explicitly to 0/false because of a bug in SPNv2 API + # implementation + # "capture_screenshot": 0, + # "outlinks_availability": 0, + } + if force_simple_get: + req_data["force_get"] = force_simple_get + if capture_outlinks: + req_data["capture_outlinks"] = capture_outlinks resp = self.v2_session.post( self.v2endpoint, - data={ - "url": request_url, - "capture_all": 1, - "capture_outlinks": capture_outlinks, - "capture_screenshot": 0, - "if_not_archived_within": "1d", - "force_get": force_simple_get, - "skip_first_archive": 1, - "outlinks_availability": 0, - "js_behavior_timeout": 0, - }, + data=req_data, ) if resp.status_code == 429: raise SavePageNowBackoffError( @@ -1032,6 +1054,7 @@ class SavePageNowClient: raise SavePageNowError( "SPN2 status_code: {}, url: {}".format(resp.status_code, request_url) ) + resp.raise_for_status() resp_json = resp.json() if ( |