diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-11-03 19:23:08 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-11-03 19:23:08 -0700 |
commit | 761c45c51ba1ff4a1214bd9e048ea4d45110888b (patch) | |
tree | ec15da22a58c83abde0bc49e0278c6f78e93ea21 | |
parent | 891299fd461b17c60fb48364cd5dca08c0711c32 (diff) | |
download | sandcrawler-761c45c51ba1ff4a1214bd9e048ea4d45110888b.tar.gz sandcrawler-761c45c51ba1ff4a1214bd9e048ea4d45110888b.zip |
workers: use HTTP session for archive.org fetches
-rw-r--r-- | python/sandcrawler/workers.py | 6 |
1 files changed, 3 insertions, 3 deletions
```diff
diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py
index bd7f36a..cca2f2c 100644
--- a/python/sandcrawler/workers.py
+++ b/python/sandcrawler/workers.py
@@ -7,7 +7,6 @@ import zipfile
 from collections import Counter
 from typing import Any, Dict, List, Optional, Sequence
 
-import requests
 from confluent_kafka import Consumer, KafkaException, Producer
 
 from .ia import (
@@ -17,7 +16,7 @@ from .ia import (
     WaybackContentError,
     WaybackError,
 )
-from .misc import parse_cdx_line
+from .misc import parse_cdx_line, requests_retry_session
 
 
 class SandcrawlerWorker(object):
@@ -127,6 +126,7 @@ class SandcrawlerFetchWorker(SandcrawlerWorker):
     def __init__(self, wayback_client: Optional[WaybackClient], **kwargs):
         super().__init__(**kwargs)
         self.wayback_client = wayback_client
+        self.http_session = requests_retry_session()
 
     def fetch_blob(self, record: Dict[str, Any]) -> Dict[str, Any]:
         default_key = record["sha1hex"]
@@ -173,7 +173,7 @@ class SandcrawlerFetchWorker(SandcrawlerWorker):
         elif record.get("item") and record.get("path"):
             # it's petabox link; fetch via HTTP
             start = time.time()
-            ia_resp = requests.get(
+            ia_resp = self.http_session.get(
                 "https://archive.org/serve/{}/{}".format(record["item"], record["path"])
             )
             petabox_sec = time.time() - start
```