diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2019-10-03 18:30:32 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2019-10-03 18:30:32 -0700 | 
| commit | 4294c40a378c386b8158f563168a29e65553395c (patch) | |
| tree | 8cad0cf974f811e625abdfbc0f0030a711a8fd07 | |
| parent | 73cfe32c7353d600ccd91eb85f92d044f759d8e4 (diff) | |
| download | sandcrawler-4294c40a378c386b8158f563168a29e65553395c.tar.gz sandcrawler-4294c40a378c386b8158f563168a29e65553395c.zip  | |
handle GROBID fetch empty blob condition
| -rw-r--r-- | python/sandcrawler/grobid.py | 3 | 
1 file changed, 2 insertions, 1 deletion
```diff
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index cdcb339..4c4112f 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -85,7 +85,8 @@ class GrobidWorker(SandcrawlerWorker):
             blob = resp.body
         else:
             raise ValueError("not a CDX (wayback) or petabox (archive.org) dict; not sure how to proceed")
-        assert blob
+        if not blob:
+            return dict(status="error", error_msg="empty blob", source=record)
         result = self.grobid_client.process_fulltext(blob, consolidate_mode=self.consolidate_mode)
         result['file_meta'] = gen_file_metadata(blob)
         result['source'] = record
```
