about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-01-21 10:59:27 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-01-21 10:59:27 -0800
commit a1b44161e206873be30c0640f5fab7a284023ba1 (patch)
tree 9c5f2aa24a6d000e803b19427eb8a66730cd72be
parent fb7717ae410f72ff33017c176f64dff556b86f5b (diff)
download sandcrawler-a1b44161e206873be30c0640f5fab7a284023ba1.tar.gz
sandcrawler-a1b44161e206873be30c0640f5fab7a284023ba1.zip
persist: work around GROBID timeouts with no status_code
-rw-r--r-- python/sandcrawler/db.py 2
-rw-r--r-- python/sandcrawler/persist.py 4
2 files changed, 3 insertions, 3 deletions
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py
index 3ec325e..e1414ba 100644
--- a/python/sandcrawler/db.py
+++ b/python/sandcrawler/db.py
@@ -161,7 +161,7 @@ class SandcrawlerPostgresClient:
r['metadata'] = json.dumps(r['metadata'], sort_keys=True)
batch = [(d['key'],
d.get('grobid_version') or None,
- d['status_code'],
+ d.get('status_code') or None,
d['status'],
d.get('fatcat_release') or None,
d.get('updated') or datetime.datetime.now(),
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 2463afa..801f76d 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -223,7 +223,7 @@ class PersistGrobidWorker(SandcrawlerWorker):
self.counts['total'] += len(batch)
for r in batch:
- if r['status_code'] != 200 or not r.get('tei_xml'):
+ if r.get('status_code') != 200 or not r.get('tei_xml'):
self.counts['s3-skip-status'] += 1
if r.get('error_msg'):
r['metadata'] = {'error_msg': r['error_msg'][:500]}
@@ -291,7 +291,7 @@ class PersistGrobidDiskWorker(SandcrawlerWorker):
def process(self, record):
- if record['status_code'] != 200 or not record.get('tei_xml'):
+ if record.get('status_code') != 200 or not record.get('tei_xml'):
return False
assert(len(record['key'])) == 40
p = "{}/{}".format(self.output_dir, self._blob_path(record['key']))