From 761c45c51ba1ff4a1214bd9e048ea4d45110888b Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Wed, 3 Nov 2021 19:23:08 -0700
Subject: workers: use HTTP session for archive.org fetches

---
 python/sandcrawler/workers.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'python')

diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py
index bd7f36a..cca2f2c 100644
--- a/python/sandcrawler/workers.py
+++ b/python/sandcrawler/workers.py
@@ -7,7 +7,6 @@ import zipfile
 from collections import Counter
 from typing import Any, Dict, List, Optional, Sequence
 
-import requests
 from confluent_kafka import Consumer, KafkaException, Producer
 
 from .ia import (
@@ -17,7 +16,7 @@ from .ia import (
     WaybackContentError,
     WaybackError,
 )
-from .misc import parse_cdx_line
+from .misc import parse_cdx_line, requests_retry_session
 
 
 class SandcrawlerWorker(object):
@@ -127,6 +126,7 @@ class SandcrawlerFetchWorker(SandcrawlerWorker):
     def __init__(self, wayback_client: Optional[WaybackClient], **kwargs):
         super().__init__(**kwargs)
         self.wayback_client = wayback_client
+        self.http_session = requests_retry_session()
 
     def fetch_blob(self, record: Dict[str, Any]) -> Dict[str, Any]:
         default_key = record["sha1hex"]
@@ -173,7 +173,7 @@ class SandcrawlerFetchWorker(SandcrawlerWorker):
         elif record.get("item") and record.get("path"):
             # it's petabox link; fetch via HTTP
             start = time.time()
-            ia_resp = requests.get(
+            ia_resp = self.http_session.get(
                 "https://archive.org/serve/{}/{}".format(record["item"], record["path"])
             )
             petabox_sec = time.time() - start
-- 
cgit v1.2.3