diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-01-21 11:32:49 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-01-21 11:32:51 -0800 |
commit | 20291471b34ea559d2ea5d45f3b05884e54d179a (patch) | |
tree | 772b58f3fe4091e30e9477e43351c1778f421e40 /python | |
parent | 8b9acb1d31b4b8ae84a5133e947ca0a577cd98d8 (diff) | |
download | sandcrawler-20291471b34ea559d2ea5d45f3b05884e54d179a.tar.gz sandcrawler-20291471b34ea559d2ea5d45f3b05884e54d179a.zip |
persist grobid: actually, status_code is required
Instead of working around a missing `status_code`, require it to exist, and
skip rows that lack it in the database insert path.
Disk mode still needs to check whether it is blank.
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/db.py | 2 | ||||
-rw-r--r-- | python/sandcrawler/persist.py | 11 |
2 files changed, 10 insertions, 3 deletions
```diff
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py
index e1414ba..3ec325e 100644
--- a/python/sandcrawler/db.py
+++ b/python/sandcrawler/db.py
@@ -161,7 +161,7 @@ class SandcrawlerPostgresClient:
             r['metadata'] = json.dumps(r['metadata'], sort_keys=True)
         batch = [(d['key'],
                   d.get('grobid_version') or None,
-                  d.get('status_code') or None,
+                  d['status_code'],
                   d['status'],
                   d.get('fatcat_release') or None,
                   d.get('updated') or datetime.datetime.now(),
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 801f76d..9ef3e93 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -222,8 +222,14 @@ class PersistGrobidWorker(SandcrawlerWorker):
     def push_batch(self, batch):
         self.counts['total'] += len(batch)
 
+        # filter out bad "missing status_code" timeout rows
+        missing = [r for r in batch if not r.get('status_code')]
+        if missing:
+            self.counts['skip-missing-status'] += len(missing)
+            batch = [r for r in batch if r.get('status_code')]
+
         for r in batch:
-            if r.get('status_code') != 200 or not r.get('tei_xml'):
+            if r['status_code'] != 200 or not r.get('tei_xml'):
                 self.counts['s3-skip-status'] += 1
                 if r.get('error_msg'):
                     r['metadata'] = {'error_msg': r['error_msg'][:500]}
@@ -265,7 +271,8 @@ class PersistGrobidWorker(SandcrawlerWorker):
             self.counts['insert-file-meta'] += resp[0]
             self.counts['update-file-meta'] += resp[1]
 
-        self.db.commit()
+            self.db.commit()
+
         return []
```