about | summary | refs | log | tree | commit | diff | stats
path: root/python
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2021-11-03 19:23:08 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-11-03 19:23:08 -0700
commit: 761c45c51ba1ff4a1214bd9e048ea4d45110888b (patch)
tree: ec15da22a58c83abde0bc49e0278c6f78e93ea21 /python
parent: 891299fd461b17c60fb48364cd5dca08c0711c32 (diff)
download: sandcrawler-761c45c51ba1ff4a1214bd9e048ea4d45110888b.tar.gz
sandcrawler-761c45c51ba1ff4a1214bd9e048ea4d45110888b.zip
workers: use HTTP session for archive.org fetches
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/workers.py 6
1 file changed, 3 insertions, 3 deletions
diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py
index bd7f36a..cca2f2c 100644
--- a/python/sandcrawler/workers.py
+++ b/python/sandcrawler/workers.py
@@ -7,7 +7,6 @@ import zipfile
from collections import Counter
from typing import Any, Dict, List, Optional, Sequence
-import requests
from confluent_kafka import Consumer, KafkaException, Producer
from .ia import (
@@ -17,7 +16,7 @@ from .ia import (
WaybackContentError,
WaybackError,
)
-from .misc import parse_cdx_line
+from .misc import parse_cdx_line, requests_retry_session
class SandcrawlerWorker(object):
@@ -127,6 +126,7 @@ class SandcrawlerFetchWorker(SandcrawlerWorker):
def __init__(self, wayback_client: Optional[WaybackClient], **kwargs):
super().__init__(**kwargs)
self.wayback_client = wayback_client
+ self.http_session = requests_retry_session()
def fetch_blob(self, record: Dict[str, Any]) -> Dict[str, Any]:
default_key = record["sha1hex"]
@@ -173,7 +173,7 @@ class SandcrawlerFetchWorker(SandcrawlerWorker):
elif record.get("item") and record.get("path"):
# it's petabox link; fetch via HTTP
start = time.time()
- ia_resp = requests.get(
+ ia_resp = self.http_session.get(
"https://archive.org/serve/{}/{}".format(record["item"], record["path"])
)
petabox_sec = time.time() - start