diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-12-01 15:49:41 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2019-12-01 15:49:41 -0800 |
commit | 6f4f375529e99cbb9c06e49805a8925ffeda269a (patch) | |
tree | 0754ee8c216ced070eb3ad2e6d0cad8832ffd9ca | |
parent | ef769224bb5408eb3dc1df7373d2182d92c92dc7 (diff) | |
download | sandcrawler-6f4f375529e99cbb9c06e49805a8925ffeda269a.tar.gz sandcrawler-6f4f375529e99cbb9c06e49805a8925ffeda269a.zip |
count empty blobs as 'failed' instead of crashing
Might be better to record an artificial kafka response instead?
-rw-r--r-- | python/sandcrawler/grobid.py | 3 |
1 file changed, 2 insertions, 1 deletion
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 06cba3e..63ca73a 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -127,7 +127,8 @@ class GrobidBlobWorker(SandcrawlerWorker):
         self.consolidate_mode = 2
 
     def process(self, blob):
-        assert blob
+        if not blob:
+            return None
         result = self.grobid_client.process_fulltext(blob, consolidate_mode=self.consolidate_mode)
         result['file_meta'] = gen_file_metadata(blob)
         result['key'] = result['file_meta']['sha1hex']