aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2019-10-03 18:30:32 -0700
committer Bryan Newbold <bnewbold@archive.org> 2019-10-03 18:30:32 -0700
commit 4294c40a378c386b8158f563168a29e65553395c (patch)
tree 8cad0cf974f811e625abdfbc0f0030a711a8fd07
parent 73cfe32c7353d600ccd91eb85f92d044f759d8e4 (diff)
download sandcrawler-4294c40a378c386b8158f563168a29e65553395c.tar.gz
sandcrawler-4294c40a378c386b8158f563168a29e65553395c.zip
handle GROBID fetch empty blob condition
-rw-r--r-- python/sandcrawler/grobid.py 3
1 files changed, 2 insertions, 1 deletions
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index cdcb339..4c4112f 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -85,7 +85,8 @@ class GrobidWorker(SandcrawlerWorker):
blob = resp.body
else:
raise ValueError("not a CDX (wayback) or petabox (archive.org) dict; not sure how to proceed")
- assert blob
+ if not blob:
+ return dict(status="error", error_msg="empty blob", source=record)
result = self.grobid_client.process_fulltext(blob, consolidate_mode=self.consolidate_mode)
result['file_meta'] = gen_file_metadata(blob)
result['source'] = record