diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-12-07 20:11:14 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-12-07 20:11:16 -0800 |
commit | 2fe319960996b560e6b20a8884cce63798c35792 (patch) | |
tree | 08bbf20c7b0986b8ef371505ce17688028e1ea15 /fatcat_scholar/sandcrawler.py | |
parent | 9e6ac281b73825c2ba79212f261b881b7f577a16 (diff) | |
download | fatcat-scholar-2fe319960996b560e6b20a8884cce63798c35792.tar.gz fatcat-scholar-2fe319960996b560e6b20a8884cce63798c35792.zip |
add requests session around postgrest fetches
This is expected to drastically improve throughput of intermediate
bundle generation, and reduce load on postgrest itself.
Diffstat (limited to 'fatcat_scholar/sandcrawler.py')
-rw-r--r-- | fatcat_scholar/sandcrawler.py | 37 |
1 file changed, 32 insertions, 5 deletions
diff --git a/fatcat_scholar/sandcrawler.py b/fatcat_scholar/sandcrawler.py index 5580841..207f240 100644 --- a/fatcat_scholar/sandcrawler.py +++ b/fatcat_scholar/sandcrawler.py @@ -1,15 +1,42 @@ -from typing import Any, Dict, Optional +from typing import Any, Dict, List, Optional import minio import requests +from requests.adapters import HTTPAdapter +from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error + + +def requests_retry_session( + retries: int = 2, + backoff_factor: int = 3, + status_forcelist: List[int] = [500, 502, 504], +) -> requests.Session: + """ + From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests + """ + session = requests.Session() + retry = Retry( + total=retries, + read=retries, + connect=retries, + backoff_factor=backoff_factor, + status_forcelist=status_forcelist, + ) + adapter = HTTPAdapter(max_retries=retry) + session.mount("http://", adapter) + session.mount("https://", adapter) + return session class SandcrawlerPostgrestClient: def __init__(self, api_url: str): self.api_url = api_url + self.session = requests_retry_session() def get_grobid(self, sha1: str) -> Optional[Dict[str, Any]]: - resp = requests.get(self.api_url + "/grobid", params=dict(sha1hex="eq." + sha1)) + resp = self.session.get( + self.api_url + "/grobid", params=dict(sha1hex="eq." + sha1) + ) resp.raise_for_status() resp_json = resp.json() if resp_json: @@ -18,7 +45,7 @@ class SandcrawlerPostgrestClient: return None def get_pdf_meta(self, sha1: str) -> Optional[Dict[str, Any]]: - resp = requests.get( + resp = self.session.get( self.api_url + "/pdf_meta", params=dict(sha1hex="eq." + sha1) ) resp.raise_for_status() @@ -29,7 +56,7 @@ class SandcrawlerPostgrestClient: return None def get_html_meta(self, sha1: str) -> Optional[Dict[str, Any]]: - resp = requests.get( + resp = self.session.get( self.api_url + "/html_meta", params=dict(sha1hex="eq." + sha1) ) resp.raise_for_status() @@ -40,7 +67,7 @@ class SandcrawlerPostgrestClient: return None def get_crossref_with_refs(self, doi: str) -> Optional[Dict[str, Any]]: - resp = requests.get( + resp = self.session.get( self.api_url + "/crossref_with_refs", params=dict(doi="eq." + doi) ) resp.raise_for_status() |