diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2020-02-24 23:38:57 -0800 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2020-02-24 23:38:57 -0800 | 
| commit | ff51a5d02fb6ab142d95eaf408a5f28e9b5f0507 (patch) | |
| tree | 35c48c4ad71e459fea10475f1b055e83a0533c44 /python | |
| parent | dd2afc1b8bf457ee15150359e00369bc995de19e (diff) | |
| download | sandcrawler-ff51a5d02fb6ab142d95eaf408a5f28e9b5f0507.tar.gz sandcrawler-ff51a5d02fb6ab142d95eaf408a5f28e9b5f0507.zip  | |
pdftrio: tweaks to avoid connection errors
Diffstat (limited to 'python')
| -rw-r--r-- | python/sandcrawler/pdftrio.py | 10 | 
1 file changed, 9 insertions, 1 deletion
diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py index 12be9eb..41eed19 100644 --- a/python/sandcrawler/pdftrio.py +++ b/python/sandcrawler/pdftrio.py @@ -34,7 +34,7 @@ class PdfTrioClient(object):                  files={                      'pdf_content': blob,                  }, -                timeout=30.0, +                timeout=60.0,              )          except requests.Timeout:              return { @@ -42,6 +42,14 @@ class PdfTrioClient(object):                  'status_code': -4,  # heritrix3 "HTTP timeout" code                  'error_msg': 'pdftrio request (HTTP POST) timeout',              } +        except requests.exceptions.ConnectionError: +            # crude back-off +            time.sleep(2.0) +            return { +                'status': 'error-connect', +                'status_code': -2,  # heritrix3 "HTTP connect" code +                'error_msg': 'pdftrio request connection timout', +            }          info = dict(              status_code=pdftrio_response.status_code,  | 
