aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2019-12-01 15:49:41 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2019-12-01 15:49:41 -0800
commit: 6f4f375529e99cbb9c06e49805a8925ffeda269a (patch)
tree: 0754ee8c216ced070eb3ad2e6d0cad8832ffd9ca /python
parent: ef769224bb5408eb3dc1df7373d2182d92c92dc7 (diff)
download: sandcrawler-6f4f375529e99cbb9c06e49805a8925ffeda269a.tar.gz
          sandcrawler-6f4f375529e99cbb9c06e49805a8925ffeda269a.zip
count empty blobs as 'failed' instead of crashing
Might be better to record an artificial kafka response instead?
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/grobid.py 3
1 file changed, 2 insertions, 1 deletion
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 06cba3e..63ca73a 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -127,7 +127,8 @@ class GrobidBlobWorker(SandcrawlerWorker):
self.consolidate_mode = 2
def process(self, blob):
- assert blob
+ if not blob:
+ return None
result = self.grobid_client.process_fulltext(blob, consolidate_mode=self.consolidate_mode)
result['file_meta'] = gen_file_metadata(blob)
result['key'] = result['file_meta']['sha1hex']