diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-01-21 11:32:49 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-01-21 11:32:51 -0800 |
commit | 20291471b34ea559d2ea5d45f3b05884e54d179a (patch) | |
tree | 772b58f3fe4091e30e9477e43351c1778f421e40 | |
parent | 8b9acb1d31b4b8ae84a5133e947ca0a577cd98d8 (diff) | |
download | sandcrawler-20291471b34ea559d2ea5d45f3b05884e54d179a.tar.gz sandcrawler-20291471b34ea559d2ea5d45f3b05884e54d179a.zip |
persist grobid: actually, status_code is required
Instead of working around a missing status_code, require it to exist, and
skip rows that lack it in the database insert section.
Disk mode still needs to check whether it is blank.
-rw-r--r-- | python/sandcrawler/db.py | 2 | ||||
-rw-r--r-- | python/sandcrawler/persist.py | 11 |
2 files changed, 10 insertions, 3 deletions
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py index e1414ba..3ec325e 100644 --- a/python/sandcrawler/db.py +++ b/python/sandcrawler/db.py @@ -161,7 +161,7 @@ class SandcrawlerPostgresClient: r['metadata'] = json.dumps(r['metadata'], sort_keys=True) batch = [(d['key'], d.get('grobid_version') or None, - d.get('status_code') or None, + d['status_code'], d['status'], d.get('fatcat_release') or None, d.get('updated') or datetime.datetime.now(), diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py index 801f76d..9ef3e93 100644 --- a/python/sandcrawler/persist.py +++ b/python/sandcrawler/persist.py @@ -222,8 +222,14 @@ class PersistGrobidWorker(SandcrawlerWorker): def push_batch(self, batch): self.counts['total'] += len(batch) + # filter out bad "missing status_code" timeout rows + missing = [r for r in batch if not r.get('status_code')] + if missing: + self.counts['skip-missing-status'] += len(missing) + batch = [r for r in batch if r.get('status_code')] + for r in batch: - if r.get('status_code') != 200 or not r.get('tei_xml'): + if r['status_code'] != 200 or not r.get('tei_xml'): self.counts['s3-skip-status'] += 1 if r.get('error_msg'): r['metadata'] = {'error_msg': r['error_msg'][:500]} @@ -265,7 +271,8 @@ class PersistGrobidWorker(SandcrawlerWorker): self.counts['insert-file-meta'] += resp[0] self.counts['update-file-meta'] += resp[1] - self.db.commit() + self.db.commit() + return [] |