diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-01-21 10:59:27 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-01-21 10:59:27 -0800 |
commit | a1b44161e206873be30c0640f5fab7a284023ba1 (patch) | |
tree | 9c5f2aa24a6d000e803b19427eb8a66730cd72be /python/sandcrawler/persist.py | |
parent | fb7717ae410f72ff33017c176f64dff556b86f5b (diff) | |
download | sandcrawler-a1b44161e206873be30c0640f5fab7a284023ba1.tar.gz sandcrawler-a1b44161e206873be30c0640f5fab7a284023ba1.zip |
persist: work around GROBID timeouts with no status_code
Diffstat (limited to 'python/sandcrawler/persist.py')
-rw-r--r-- | python/sandcrawler/persist.py | 4 |
1 file changed, 2 insertions, 2 deletions
```diff
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 2463afa..801f76d 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -223,7 +223,7 @@ class PersistGrobidWorker(SandcrawlerWorker):
         self.counts['total'] += len(batch)
 
         for r in batch:
-            if r['status_code'] != 200 or not r.get('tei_xml'):
+            if r.get('status_code') != 200 or not r.get('tei_xml'):
                 self.counts['s3-skip-status'] += 1
                 if r.get('error_msg'):
                     r['metadata'] = {'error_msg': r['error_msg'][:500]}
@@ -291,7 +291,7 @@ class PersistGrobidDiskWorker(SandcrawlerWorker):
 
     def process(self, record):
 
-        if record['status_code'] != 200 or not record.get('tei_xml'):
+        if record.get('status_code') != 200 or not record.get('tei_xml'):
             return False
         assert(len(record['key'])) == 40
         p = "{}/{}".format(self.output_dir, self._blob_path(record['key']))
```