diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2021-11-03 19:19:42 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2021-11-04 17:19:52 -0700 | 
| commit | 8379d65f1b159cbb8e8a1abce5fb05a3f013a10e (patch) | |
| tree | 10d7bb413e944d95f9c50656e776afe5f75799da | |
| parent | b4ceb130504cacbb75549e46719159f4e5ab5c51 (diff) | |
| download | sandcrawler-8379d65f1b159cbb8e8a1abce5fb05a3f013a10e.tar.gz sandcrawler-8379d65f1b159cbb8e8a1abce5fb05a3f013a10e.zip | |
grobid: use requests session
This should fix an embarrassing bug with exhausting local ports:
        requests.exceptions.ConnectionError: HTTPConnectionPool(host='wbgrp-svc096.us.archive.org', port=8070): Max retries exceeded with url: /api/processCitationList (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f8dfc24e250>: Failed to establish a new connection: [Errno 99] Cannot assign requested address'))
| -rw-r--r-- | python/sandcrawler/grobid.py | 7 | 
1 files changed, 4 insertions, 3 deletions
| diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py index 8ed6d7e..b9dd196 100644 --- a/python/sandcrawler/grobid.py +++ b/python/sandcrawler/grobid.py @@ -8,7 +8,7 @@ import requests  from grobid_tei_xml import GrobidBiblio, parse_citation_list_xml, parse_document_xml  from .ia import WaybackClient -from .misc import gen_file_metadata +from .misc import gen_file_metadata, requests_retry_session  from .workers import SandcrawlerFetchWorker, SandcrawlerWorker @@ -72,6 +72,7 @@ class GrobidClient(object):      def __init__(self, host_url: str = "https://grobid.qa.fatcat.wiki", **kwargs):          self.host_url = host_url          self.consolidate_mode = int(kwargs.get("consolidate_mode", 0)) +        self.session = requests_retry_session()      def process_fulltext(          self, blob: bytes, consolidate_mode: Optional[int] = None @@ -92,7 +93,7 @@ class GrobidClient(object):          assert consolidate_mode is not None          try: -            grobid_response = requests.post( +            grobid_response = self.session.post(                  self.host_url + "/api/processFulltextDocument",                  files={                      "input": blob, @@ -134,7 +135,7 @@ class GrobidClient(object):              raise ValueError("more than 5,000 references in a batch is just too much")          try: -            grobid_response = requests.post( +            grobid_response = self.session.post(                  self.host_url + "/api/processCitationList",                  data={                      "citations": unstructured_list, | 
