about | summary | refs | log | tree | commit | diff | stats
path: root/python/sandcrawler/grobid.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2022-05-16 15:02:02 -0700
committer Bryan Newbold <bnewbold@archive.org> 2022-05-16 15:02:02 -0700
commit fcc5a1648d2e49e7002ca569ed668d3318a75584 (patch)
tree 6ab5e044625b9d7de9a3596b1c212c259e68669e /python/sandcrawler/grobid.py
parent 1534ff4d05c6fca460e82b5707fe3fbdc3504e50 (diff)
download sandcrawler-fcc5a1648d2e49e7002ca569ed668d3318a75584.tar.gz
sandcrawler-fcc5a1648d2e49e7002ca569ed668d3318a75584.zip
ingest: catch more ConnectionErrors (SPN, replay fetch, GROBID)
Diffstat (limited to 'python/sandcrawler/grobid.py')
-rw-r--r-- python/sandcrawler/grobid.py 9
1 files changed, 9 insertions, 0 deletions
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 1f957da..aa2c112 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -120,6 +120,15 @@ class GrobidClient(object):
"status_code": -4, # heritrix3 "HTTP timeout" code
"error_msg": "GROBID request (HTTP POST) timeout",
}
+ except requests.exceptions.ConnectionError as ce:
+ # intentionally raising this, so workers crash when GROBID
+ # unavailable. but do add a sleep to slow things down.
+ print(
+ "GROBID ConnectionError. sleeping as a slow-down before crashing",
+ file=sys.stderr,
+ )
+ time.sleep(5.0)
+ raise ce
info: Dict[str, Any] = dict(status_code=grobid_response.status_code)
if grobid_response.status_code == 200: