about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-12-07 20:11:14 -0800
committer Bryan Newbold <bnewbold@archive.org> 2021-12-07 20:11:16 -0800
commit 2fe319960996b560e6b20a8884cce63798c35792 (patch)
tree 08bbf20c7b0986b8ef371505ce17688028e1ea15
parent 9e6ac281b73825c2ba79212f261b881b7f577a16 (diff)
download fatcat-scholar-2fe319960996b560e6b20a8884cce63798c35792.tar.gz
fatcat-scholar-2fe319960996b560e6b20a8884cce63798c35792.zip
add requests session around postgrest fetches
This is expected to drastically improve throughput of intermediate bundle generation, and reduce load on postgrest itself.
-rw-r--r-- fatcat_scholar/query_fatcat.py 27
-rw-r--r-- fatcat_scholar/sandcrawler.py 37
2 files changed, 34 insertions, 30 deletions
diff --git a/fatcat_scholar/query_fatcat.py b/fatcat_scholar/query_fatcat.py
index b63d834..3856f0f 100644
--- a/fatcat_scholar/query_fatcat.py
+++ b/fatcat_scholar/query_fatcat.py
@@ -2,35 +2,12 @@ import argparse
import json
import os
import sys
-from typing import Any, List
+from typing import Any
import elasticsearch
-import requests
from elasticsearch_dsl import Q, Search
-from requests.adapters import HTTPAdapter
-from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error
-
-def requests_retry_session(
- retries: int = 2,
- backoff_factor: int = 3,
- status_forcelist: List[int] = [500, 502, 504],
-) -> requests.Session:
- """
- From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests
- """
- session = requests.Session()
- retry = Retry(
- total=retries,
- read=retries,
- connect=retries,
- backoff_factor=backoff_factor,
- status_forcelist=status_forcelist,
- )
- adapter = HTTPAdapter(max_retries=retry)
- session.mount("http://", adapter)
- session.mount("https://", adapter)
- return session
+from fatcat_scholar.sandcrawler import requests_retry_session
def run_query_fatcat(query: str, fulltext_only: bool, json_output: Any) -> None:
diff --git a/fatcat_scholar/sandcrawler.py b/fatcat_scholar/sandcrawler.py
index 5580841..207f240 100644
--- a/fatcat_scholar/sandcrawler.py
+++ b/fatcat_scholar/sandcrawler.py
@@ -1,15 +1,42 @@
-from typing import Any, Dict, Optional
+from typing import Any, Dict, List, Optional
import minio
import requests
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error
+
+
+def requests_retry_session(
+ retries: int = 2,
+ backoff_factor: int = 3,
+ status_forcelist: List[int] = [500, 502, 504],
+) -> requests.Session:
+ """
+ From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests
+ """
+ session = requests.Session()
+ retry = Retry(
+ total=retries,
+ read=retries,
+ connect=retries,
+ backoff_factor=backoff_factor,
+ status_forcelist=status_forcelist,
+ )
+ adapter = HTTPAdapter(max_retries=retry)
+ session.mount("http://", adapter)
+ session.mount("https://", adapter)
+ return session
class SandcrawlerPostgrestClient:
def __init__(self, api_url: str):
self.api_url = api_url
+ self.session = requests_retry_session()
def get_grobid(self, sha1: str) -> Optional[Dict[str, Any]]:
- resp = requests.get(self.api_url + "/grobid", params=dict(sha1hex="eq." + sha1))
+ resp = self.session.get(
+ self.api_url + "/grobid", params=dict(sha1hex="eq." + sha1)
+ )
resp.raise_for_status()
resp_json = resp.json()
if resp_json:
@@ -18,7 +45,7 @@ class SandcrawlerPostgrestClient:
return None
def get_pdf_meta(self, sha1: str) -> Optional[Dict[str, Any]]:
- resp = requests.get(
+ resp = self.session.get(
self.api_url + "/pdf_meta", params=dict(sha1hex="eq." + sha1)
)
resp.raise_for_status()
@@ -29,7 +56,7 @@ class SandcrawlerPostgrestClient:
return None
def get_html_meta(self, sha1: str) -> Optional[Dict[str, Any]]:
- resp = requests.get(
+ resp = self.session.get(
self.api_url + "/html_meta", params=dict(sha1hex="eq." + sha1)
)
resp.raise_for_status()
@@ -40,7 +67,7 @@ class SandcrawlerPostgrestClient:
return None
def get_crossref_with_refs(self, doi: str) -> Optional[Dict[str, Any]]:
- resp = requests.get(
+ resp = self.session.get(
self.api_url + "/crossref_with_refs", params=dict(doi="eq." + doi)
)
resp.raise_for_status()