diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-11-03 19:21:54 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-11-03 19:21:57 -0700 |
commit | 891299fd461b17c60fb48364cd5dca08c0711c32 (patch) | |
tree | c079448b1e5bfbd5d0acdf2baadc52d05b91352d | |
parent | 848556a64d13955c2978bad352f2e2cd9edb62d0 (diff) | |
download | sandcrawler-891299fd461b17c60fb48364cd5dca08c0711c32.tar.gz sandcrawler-891299fd461b17c60fb48364cd5dca08c0711c32.zip |
IA (wayback): actually use an HTTP session for replay fetches
I am embarrassed this wasn't actually the case already! Looks like I had
even instantiated a session but wasn't using it.
Hopefully this change, which adds extra retries and better backoff
behavior, will improve sandcrawler ingest throughput.
-rw-r--r-- | python/sandcrawler/ia.py | 5 |
1 files changed, 3 insertions, 2 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index 99a7f36..d334d24 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -395,6 +395,7 @@ class WaybackClient: self.replay_headers = { "User-Agent": "Mozilla/5.0 sandcrawler.WaybackClient", } + self.http_session = requests_retry_session() def fetch_petabox( self, csize: int, offset: int, warc_path: str, resolve_revisit: bool = True @@ -603,7 +604,7 @@ class WaybackClient: assert datetime.isdigit() try: - resp = requests.get( + resp = self.http_session.get( self.wayback_endpoint + datetime + "id_/" + url, allow_redirects=False, headers=self.replay_headers, @@ -670,7 +671,7 @@ class WaybackClient: assert datetime.isdigit() try: - resp = requests.get( + resp = self.http_session.get( self.wayback_endpoint + datetime + "id_/" + url, allow_redirects=False, headers=self.replay_headers, |