about | summary | refs | log | tree | commit | diff | stats
path: root/python/sandcrawler/persist.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-01-21 10:59:27 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-01-21 10:59:27 -0800
commit: a1b44161e206873be30c0640f5fab7a284023ba1 (patch)
tree: 9c5f2aa24a6d000e803b19427eb8a66730cd72be /python/sandcrawler/persist.py
parent: fb7717ae410f72ff33017c176f64dff556b86f5b (diff)
download: sandcrawler-a1b44161e206873be30c0640f5fab7a284023ba1.tar.gz
sandcrawler-a1b44161e206873be30c0640f5fab7a284023ba1.zip
persist: work around GROBID timeouts with no status_code
Diffstat (limited to 'python/sandcrawler/persist.py')
-rw-r--r-- python/sandcrawler/persist.py 4
1 files changed, 2 insertions, 2 deletions
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 2463afa..801f76d 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -223,7 +223,7 @@ class PersistGrobidWorker(SandcrawlerWorker):
self.counts['total'] += len(batch)
for r in batch:
- if r['status_code'] != 200 or not r.get('tei_xml'):
+ if r.get('status_code') != 200 or not r.get('tei_xml'):
self.counts['s3-skip-status'] += 1
if r.get('error_msg'):
r['metadata'] = {'error_msg': r['error_msg'][:500]}
@@ -291,7 +291,7 @@ class PersistGrobidDiskWorker(SandcrawlerWorker):
def process(self, record):
- if record['status_code'] != 200 or not record.get('tei_xml'):
+ if record.get('status_code') != 200 or not record.get('tei_xml'):
return False
assert(len(record['key'])) == 40
p = "{}/{}".format(self.output_dir, self._blob_path(record['key']))