diff options
-rw-r--r-- | python/sandcrawler/pdftrio.py | 10 |
1 files changed, 9 insertions, 1 deletions
diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py
index 12be9eb..41eed19 100644
--- a/python/sandcrawler/pdftrio.py
+++ b/python/sandcrawler/pdftrio.py
@@ -34,7 +34,7 @@ class PdfTrioClient(object):
                 files={
                     'pdf_content': blob,
                 },
-                timeout=30.0,
+                timeout=60.0,
             )
         except requests.Timeout:
             return {
@@ -42,6 +42,14 @@ class PdfTrioClient(object):
                 'status_code': -4, # heritrix3 "HTTP timeout" code
                 'error_msg': 'pdftrio request (HTTP POST) timeout',
             }
+        except requests.exceptions.ConnectionError:
+            # crude back-off
+            time.sleep(2.0)
+            return {
+                'status': 'error-connect',
+                'status_code': -2, # heritrix3 "HTTP connect" code
+                'error_msg': 'pdftrio request connection timout',
+            }
 
         info = dict(
             status_code=pdftrio_response.status_code,