From ff51a5d02fb6ab142d95eaf408a5f28e9b5f0507 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Mon, 24 Feb 2020 23:38:57 -0800
Subject: pdftrio: tweaks to avoid connection errors

---
 python/sandcrawler/pdftrio.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'python')

diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py
index 12be9eb..41eed19 100644
--- a/python/sandcrawler/pdftrio.py
+++ b/python/sandcrawler/pdftrio.py
@@ -34,7 +34,7 @@ class PdfTrioClient(object):
                 files={
                     'pdf_content': blob,
                 },
-                timeout=30.0,
+                timeout=60.0,
             )
         except requests.Timeout:
             return {
@@ -42,6 +42,14 @@ class PdfTrioClient(object):
                 'status_code': -4, # heritrix3 "HTTP timeout" code
                 'error_msg': 'pdftrio request (HTTP POST) timeout',
             }
+        except requests.exceptions.ConnectionError:
+            # crude back-off
+            time.sleep(2.0)
+            return {
+                'status': 'error-connect',
+                'status_code': -2, # heritrix3 "HTTP connect" code
+                'error_msg': 'pdftrio request connection timout',
+            }
 
         info = dict(
             status_code=pdftrio_response.status_code,
-- 
cgit v1.2.3