diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-02-24 23:38:57 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-02-24 23:38:57 -0800 |
commit | ff51a5d02fb6ab142d95eaf408a5f28e9b5f0507 (patch) | |
tree | 35c48c4ad71e459fea10475f1b055e83a0533c44 /python | |
parent | dd2afc1b8bf457ee15150359e00369bc995de19e (diff) | |
download | sandcrawler-ff51a5d02fb6ab142d95eaf408a5f28e9b5f0507.tar.gz sandcrawler-ff51a5d02fb6ab142d95eaf408a5f28e9b5f0507.zip |
pdftrio: tweaks to avoid connection errors
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/pdftrio.py | 10 |
1 files changed, 9 insertions, 1 deletions
```diff
diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py
index 12be9eb..41eed19 100644
--- a/python/sandcrawler/pdftrio.py
+++ b/python/sandcrawler/pdftrio.py
@@ -34,7 +34,7 @@ class PdfTrioClient(object):
                 files={
                     'pdf_content': blob,
                 },
-                timeout=30.0,
+                timeout=60.0,
             )
         except requests.Timeout:
             return {
@@ -42,6 +42,14 @@ class PdfTrioClient(object):
                 'status_code': -4, # heritrix3 "HTTP timeout" code
                 'error_msg': 'pdftrio request (HTTP POST) timeout',
             }
+        except requests.exceptions.ConnectionError:
+            # crude back-off
+            time.sleep(2.0)
+            return {
+                'status': 'error-connect',
+                'status_code': -2, # heritrix3 "HTTP connect" code
+                'error_msg': 'pdftrio request connection timout',
+            }
 
         info = dict(
             status_code=pdftrio_response.status_code,
```