about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- python/sandcrawler/grobid.py 17
1 files changed, 13 insertions, 4 deletions
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index a610404..cdcb339 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -61,18 +61,27 @@ class GrobidWorker(SandcrawlerWorker):
# it's a full CDX dict. fetch using WaybackClient
if not self.wayback_client:
raise Exception("wayback client not configured for this GrobidWorker")
- blob = self.wayback_client.fetch_warc_content(record['warc_path'],
- record['warc_offset'], record['warc_csize'])
+ try:
+ blob = self.wayback_client.fetch_warc_content(record['warc_path'],
+ record['warc_offset'], record['warc_csize'])
+ except WaybackError as we:
+ return dict(status="error-wayback", error_msg=str(we), source=record)
elif record.get('url') and record.get('datetime'):
# it's a partial CDX dict or something? fetch using WaybackClient
if not self.wayback_client:
raise Exception("wayback client not configured for this GrobidWorker")
- blob = self.wayback_client.fetch_url_datetime(record['url'], record['datetime'])
+ try:
+ blob = self.wayback_client.fetch_url_datetime(record['url'], record['datetime'])
+ except WaybackError as we:
+ return dict(status="error-wayback", error_msg=str(we), source=record)
elif record.get('item') and record.get('path'):
# it's petabox link; fetch via HTTP
resp = requests.get("https://archive.org/serve/{}/{}".format(
record['item'], record['path']))
- resp.raise_for_status()
+ try:
+ resp.raise_for_status()
+ except Exception as e:
+ return dict(status="error-petabox", error_msg=str(e), source=record)
blob = resp.body
else:
raise ValueError("not a CDX (wayback) or petabox (archive.org) dict; not sure how to proceed")