diff options
author | Bryan Newbold <bnewbold@archive.org> | 2022-05-16 15:02:02 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2022-05-16 15:02:02 -0700 |
commit | fcc5a1648d2e49e7002ca569ed668d3318a75584 (patch) | |
tree | 6ab5e044625b9d7de9a3596b1c212c259e68669e /python/sandcrawler/grobid.py | |
parent | 1534ff4d05c6fca460e82b5707fe3fbdc3504e50 (diff) | |
download | sandcrawler-fcc5a1648d2e49e7002ca569ed668d3318a75584.tar.gz sandcrawler-fcc5a1648d2e49e7002ca569ed668d3318a75584.zip |
ingest: catch more ConnectionErrors (SPN, replay fetch, GROBID)
Diffstat (limited to 'python/sandcrawler/grobid.py')
-rw-r--r-- | python/sandcrawler/grobid.py | 9 |
1 files changed, 9 insertions, 0 deletions
```diff
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 1f957da..aa2c112 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -120,6 +120,15 @@ class GrobidClient(object):
                 "status_code": -4, # heritrix3 "HTTP timeout" code
                 "error_msg": "GROBID request (HTTP POST) timeout",
             }
+        except requests.exceptions.ConnectionError as ce:
+            # intentionally raising this, so workers crash when GROBID
+            # unavailable. but do add a sleep to slow things down.
+            print(
+                "GROBID ConnectionError. sleeping as a slow-down before crashing",
+                file=sys.stderr,
+            )
+            time.sleep(5.0)
+            raise ce
 
         info: Dict[str, Any] = dict(status_code=grobid_response.status_code)
         if grobid_response.status_code == 200:
```