aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-02-24 23:38:57 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-02-24 23:38:57 -0800
commit ff51a5d02fb6ab142d95eaf408a5f28e9b5f0507 (patch)
tree 35c48c4ad71e459fea10475f1b055e83a0533c44 /python
parent dd2afc1b8bf457ee15150359e00369bc995de19e (diff)
download sandcrawler-ff51a5d02fb6ab142d95eaf408a5f28e9b5f0507.tar.gz
sandcrawler-ff51a5d02fb6ab142d95eaf408a5f28e9b5f0507.zip
pdftrio: tweaks to avoid connection errors
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/pdftrio.py 10
1 files changed, 9 insertions, 1 deletions
diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py
index 12be9eb..41eed19 100644
--- a/python/sandcrawler/pdftrio.py
+++ b/python/sandcrawler/pdftrio.py
@@ -34,7 +34,7 @@ class PdfTrioClient(object):
files={
'pdf_content': blob,
},
- timeout=30.0,
+ timeout=60.0,
)
except requests.Timeout:
return {
@@ -42,6 +42,14 @@ class PdfTrioClient(object):
'status_code': -4, # heritrix3 "HTTP timeout" code
'error_msg': 'pdftrio request (HTTP POST) timeout',
}
+ except requests.exceptions.ConnectionError:
+ # crude back-off
+ time.sleep(2.0)
+ return {
+ 'status': 'error-connect',
+ 'status_code': -2, # heritrix3 "HTTP connect" code
+ 'error_msg': 'pdftrio request connection timout',
+ }
info = dict(
status_code=pdftrio_response.status_code,