about summary refs log tree commit diff stats
path: root/python
diff options
context:
space:
mode:
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/db.py 36
1 file changed, 24 insertions, 12 deletions
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py
index 101d419..f9018ec 100644
--- a/python/sandcrawler/db.py
+++ b/python/sandcrawler/db.py
@@ -4,20 +4,24 @@ from typing import Any, Dict, List, Optional, Tuple
import psycopg2
import psycopg2.extras
-import requests
+
+from .misc import requests_retry_session
class SandcrawlerPostgrestClient:
def __init__(self, api_url: str = "http://wbgrp-svc506.us.archive.org:3030", **kwargs):
self.api_url = api_url
+ self.http_session = requests_retry_session()
def get_cdx(self, url: str) -> Optional[dict]:
- resp = requests.get(self.api_url + "/cdx", params=dict(url="eq." + url))
+ resp = self.http_session.get(self.api_url + "/cdx", params=dict(url="eq." + url))
resp.raise_for_status()
return resp.json() or None
def get_grobid(self, sha1: str) -> Optional[dict]:
- resp = requests.get(self.api_url + "/grobid", params=dict(sha1hex="eq." + sha1))
+ resp = self.http_session.get(
+ self.api_url + "/grobid", params=dict(sha1hex="eq." + sha1)
+ )
resp.raise_for_status()
resp_json = resp.json()
if resp_json:
@@ -26,7 +30,9 @@ class SandcrawlerPostgrestClient:
return None
def get_pdftrio(self, sha1: str) -> Optional[dict]:
- resp = requests.get(self.api_url + "/pdftrio", params=dict(sha1hex="eq." + sha1))
+ resp = self.http_session.get(
+ self.api_url + "/pdftrio", params=dict(sha1hex="eq." + sha1)
+ )
resp.raise_for_status()
resp_json = resp.json()
if resp_json:
@@ -35,7 +41,9 @@ class SandcrawlerPostgrestClient:
return None
def get_pdf_meta(self, sha1: str) -> Optional[dict]:
- resp = requests.get(self.api_url + "/pdf_meta", params=dict(sha1hex="eq." + sha1))
+ resp = self.http_session.get(
+ self.api_url + "/pdf_meta", params=dict(sha1hex="eq." + sha1)
+ )
resp.raise_for_status()
resp_json = resp.json()
if resp_json:
@@ -44,7 +52,7 @@ class SandcrawlerPostgrestClient:
return None
def get_html_meta(self, sha1hex: str) -> Optional[dict]:
- resp = requests.get(
+ resp = self.http_session.get(
self.api_url + "/html_meta",
params=dict(sha1hex=f"eq.{sha1hex}"),
)
@@ -56,7 +64,9 @@ class SandcrawlerPostgrestClient:
return None
def get_file_meta(self, sha1: str) -> Optional[dict]:
- resp = requests.get(self.api_url + "/file_meta", params=dict(sha1hex="eq." + sha1))
+ resp = self.http_session.get(
+ self.api_url + "/file_meta", params=dict(sha1hex="eq." + sha1)
+ )
resp.raise_for_status()
resp_json = resp.json()
if resp_json:
@@ -65,7 +75,7 @@ class SandcrawlerPostgrestClient:
return None
def get_ingest_file_result(self, ingest_type: str, url: str) -> Optional[dict]:
- resp = requests.get(
+ resp = self.http_session.get(
self.api_url + "/ingest_file_result",
params=dict(ingest_type=f"eq.{ingest_type}", base_url=f"eq.{url}"),
)
@@ -77,7 +87,7 @@ class SandcrawlerPostgrestClient:
return None
def get_ingest_fileset_platform(self, ingest_type: str, url: str) -> Optional[dict]:
- resp = requests.get(
+ resp = self.http_session.get(
self.api_url + "/ingest_fileset_platform",
params=dict(ingest_type=f"eq.{ingest_type}", base_url=f"eq.{url}"),
)
@@ -89,7 +99,7 @@ class SandcrawlerPostgrestClient:
return None
def get_crossref(self, doi: str) -> Optional[dict]:
- resp = requests.get(self.api_url + "/crossref", params=dict(doi=f"eq.{doi}"))
+ resp = self.http_session.get(self.api_url + "/crossref", params=dict(doi=f"eq.{doi}"))
resp.raise_for_status()
resp_json = resp.json()
if resp_json:
@@ -98,7 +108,9 @@ class SandcrawlerPostgrestClient:
return None
def get_crossref_with_refs(self, doi: str) -> Optional[dict]:
- resp = requests.get(self.api_url + "/crossref_with_refs", params=dict(doi=f"eq.{doi}"))
+ resp = self.http_session.get(
+ self.api_url + "/crossref_with_refs", params=dict(doi=f"eq.{doi}")
+ )
resp.raise_for_status()
resp_json = resp.json()
if resp_json:
@@ -107,7 +119,7 @@ class SandcrawlerPostgrestClient:
return None
def get_grobid_refs(self, source: str, source_id: str) -> Optional[dict]:
- resp = requests.get(
+ resp = self.http_session.get(
self.api_url + "/grobid_refs",
params=dict(source=f"eq.{source}", source_id=f"eq.{source_id}"),
)