From fe8c71f7f942a02ed144847cca8149486ceaee9a Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 27 Dec 2019 12:45:15 -0800 Subject: fix DB import counting --- python/sandcrawler/db.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'python/sandcrawler/db.py') diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py index e89edc7..21ac82a 100644 --- a/python/sandcrawler/db.py +++ b/python/sandcrawler/db.py @@ -46,6 +46,7 @@ class SandcrawlerPostgresClient: return self.conn.commit() def _inserts_and_updates(self, resp, on_conflict): + resp = [int(r[0]) for r in resp] inserts = len([r for r in resp if r == 0]) if on_conflict == "update": updates = len([r for r in resp if r != 0]) @@ -66,16 +67,16 @@ class SandcrawlerPostgresClient: raise NotImplementedError("on_conflict: {}".format(on_conflict)) sql += " RETURNING xmax;" - batch = [d for d in batch if d.get('warc_offset')] + batch = [d for d in batch if d.get('warc_path')] if not batch: - return 0 + return (0, 0) batch = [(d['url'], d['datetime'], d['sha1hex'], d['mimetype'], d['warc_path'], - d['warc_csize'], - d['warc_offset']) + int(d['warc_csize']), + int(d['warc_offset'])) for d in batch] resp = psycopg2.extras.execute_values(cur, sql, batch, page_size=250, fetch=True) return self._inserts_and_updates(resp, on_conflict) -- cgit v1.2.3