From 4294c40a378c386b8158f563168a29e65553395c Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Thu, 3 Oct 2019 18:30:32 -0700
Subject: handle GROBID fetch empty blob condition

---
 python/sandcrawler/grobid.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index cdcb339..4c4112f 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -85,7 +85,8 @@ class GrobidWorker(SandcrawlerWorker):
             blob = resp.body
         else:
             raise ValueError("not a CDX (wayback) or petabox (archive.org) dict; not sure how to proceed")
-        assert blob
+        if not blob:
+            return dict(status="error", error_msg="empty blob", source=record)
         result = self.grobid_client.process_fulltext(blob, consolidate_mode=self.consolidate_mode)
         result['file_meta'] = gen_file_metadata(blob)
         result['source'] = record
-- 
cgit v1.2.3