about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2019-10-02 18:01:07 -0700
committer Bryan Newbold <bnewbold@archive.org> 2019-10-02 18:01:07 -0700
commit d2cd959dd19a10e03ab9e8bbd11787266dbca309 (patch)
tree edc7846196f48fc3ca3997f7a654a641f5875ce4
parent 5c2f5b575e88c3714958634969af3ef403db0ee8 (diff)
download sandcrawler-d2cd959dd19a10e03ab9e8bbd11787266dbca309.tar.gz
sandcrawler-d2cd959dd19a10e03ab9e8bbd11787266dbca309.zip
have grobidworker error status indicate issues instead of bailing
-rw-r--r-- python/sandcrawler/grobid.py 17
1 files changed, 13 insertions, 4 deletions
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index a610404..cdcb339 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -61,18 +61,27 @@ class GrobidWorker(SandcrawlerWorker):
# it's a full CDX dict. fetch using WaybackClient
if not self.wayback_client:
raise Exception("wayback client not configured for this GrobidWorker")
- blob = self.wayback_client.fetch_warc_content(record['warc_path'],
- record['warc_offset'], record['warc_csize'])
+ try:
+ blob = self.wayback_client.fetch_warc_content(record['warc_path'],
+ record['warc_offset'], record['warc_csize'])
+ except WaybackError as we:
+ return dict(status="error-wayback", error_msg=str(we), source=record)
elif record.get('url') and record.get('datetime'):
# it's a partial CDX dict or something? fetch using WaybackClient
if not self.wayback_client:
raise Exception("wayback client not configured for this GrobidWorker")
- blob = self.wayback_client.fetch_url_datetime(record['url'], record['datetime'])
+ try:
+ blob = self.wayback_client.fetch_url_datetime(record['url'], record['datetime'])
+ except WaybackError as we:
+ return dict(status="error-wayback", error_msg=str(we), source=record)
elif record.get('item') and record.get('path'):
# it's petabox link; fetch via HTTP
resp = requests.get("https://archive.org/serve/{}/{}".format(
record['item'], record['path']))
- resp.raise_for_status()
+ try:
+ resp.raise_for_status()
+ except Exception as e:
+ return dict(status="error-petabox", error_msg=str(e), source=record)
blob = resp.body
else:
raise ValueError("not a CDX (wayback) or petabox (archive.org) dict; not sure how to proceed")