From 8379d65f1b159cbb8e8a1abce5fb05a3f013a10e Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 3 Nov 2021 19:19:42 -0700 Subject: grobid: use requests session This should fix an embarrassing bug with exhausting local ports: requests.exceptions.ConnectionError: HTTPConnectionPool(host='wbgrp-svc096.us.archive.org', port=8070): Max retries exceeded with url: /api/processCitationList (Caused by NewConnectionError(': Failed to establish a new connection: [Errno 99] Cannot assign requested address')) --- python/sandcrawler/grobid.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'python') diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py index 8ed6d7e..b9dd196 100644 --- a/python/sandcrawler/grobid.py +++ b/python/sandcrawler/grobid.py @@ -8,7 +8,7 @@ import requests from grobid_tei_xml import GrobidBiblio, parse_citation_list_xml, parse_document_xml from .ia import WaybackClient -from .misc import gen_file_metadata +from .misc import gen_file_metadata, requests_retry_session from .workers import SandcrawlerFetchWorker, SandcrawlerWorker @@ -72,6 +72,7 @@ class GrobidClient(object): def __init__(self, host_url: str = "https://grobid.qa.fatcat.wiki", **kwargs): self.host_url = host_url self.consolidate_mode = int(kwargs.get("consolidate_mode", 0)) + self.session = requests_retry_session() def process_fulltext( self, blob: bytes, consolidate_mode: Optional[int] = None @@ -92,7 +93,7 @@ class GrobidClient(object): assert consolidate_mode is not None try: - grobid_response = requests.post( + grobid_response = self.session.post( self.host_url + "/api/processFulltextDocument", files={ "input": blob, @@ -134,7 +135,7 @@ class GrobidClient(object): raise ValueError("more than 5,000 references in a batch is just too much") try: - grobid_response = requests.post( + grobid_response = self.session.post( self.host_url + "/api/processCitationList", data={ "citations": unstructured_list, -- cgit v1.2.3