diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2019-12-01 15:49:41 -0800 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2019-12-01 15:49:41 -0800 | 
| commit | 6f4f375529e99cbb9c06e49805a8925ffeda269a (patch) | |
| tree | 0754ee8c216ced070eb3ad2e6d0cad8832ffd9ca | |
| parent | ef769224bb5408eb3dc1df7373d2182d92c92dc7 (diff) | |
| download | sandcrawler-6f4f375529e99cbb9c06e49805a8925ffeda269a.tar.gz sandcrawler-6f4f375529e99cbb9c06e49805a8925ffeda269a.zip | |
count empty blobs as 'failed' instead of crashing
Might be better to record an artificial kafka response instead?
| -rw-r--r-- | python/sandcrawler/grobid.py | 3 | 
1 file changed, 2 insertions, 1 deletion
```diff
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 06cba3e..63ca73a 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -127,7 +127,8 @@ class GrobidBlobWorker(SandcrawlerWorker):
         self.consolidate_mode = 2
 
     def process(self, blob):
-        assert blob
+        if not blob:
+            return None
         result = self.grobid_client.process_fulltext(blob, consolidate_mode=self.consolidate_mode)
         result['file_meta'] = gen_file_metadata(blob)
         result['key'] = result['file_meta']['sha1hex']
```
