diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-10-02 18:01:07 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2019-10-02 18:01:07 -0700 |
commit | d2cd959dd19a10e03ab9e8bbd11787266dbca309 (patch) | |
tree | edc7846196f48fc3ca3997f7a654a641f5875ce4 | |
parent | 5c2f5b575e88c3714958634969af3ef403db0ee8 (diff) | |
download | sandcrawler-d2cd959dd19a10e03ab9e8bbd11787266dbca309.tar.gz sandcrawler-d2cd959dd19a10e03ab9e8bbd11787266dbca309.zip |
Have GrobidWorker return an error status describing the fetch failure instead of bailing with an exception
-rw-r--r-- | python/sandcrawler/grobid.py | 17 |
1 files changed, 13 insertions, 4 deletions
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py index a610404..cdcb339 100644 --- a/python/sandcrawler/grobid.py +++ b/python/sandcrawler/grobid.py @@ -61,18 +61,27 @@ class GrobidWorker(SandcrawlerWorker): # it's a full CDX dict. fetch using WaybackClient if not self.wayback_client: raise Exception("wayback client not configured for this GrobidWorker") - blob = self.wayback_client.fetch_warc_content(record['warc_path'], - record['warc_offset'], record['warc_csize']) + try: + blob = self.wayback_client.fetch_warc_content(record['warc_path'], + record['warc_offset'], record['warc_csize']) + except WaybackError as we: + return dict(status="error-wayback", error_msg=str(we), source=record) elif record.get('url') and record.get('datetime'): # it's a partial CDX dict or something? fetch using WaybackClient if not self.wayback_client: raise Exception("wayback client not configured for this GrobidWorker") - blob = self.wayback_client.fetch_url_datetime(record['url'], record['datetime']) + try: + blob = self.wayback_client.fetch_url_datetime(record['url'], record['datetime']) + except WaybackError as we: + return dict(status="error-wayback", error_msg=str(we), source=record) elif record.get('item') and record.get('path'): # it's petabox link; fetch via HTTP resp = requests.get("https://archive.org/serve/{}/{}".format( record['item'], record['path'])) - resp.raise_for_status() + try: + resp.raise_for_status() + except Exception as e: + return dict(status="error-petabox", error_msg=str(e), source=record) blob = resp.body else: raise ValueError("not a CDX (wayback) or petabox (archive.org) dict; not sure how to proceed") |