diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-12-27 12:45:15 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-01-02 18:12:58 -0800 |
commit | fe8c71f7f942a02ed144847cca8149486ceaee9a (patch) | |
tree | 443081f167208a893645a74dcbe395ef2d74ae81 /python | |
parent | 759f5cce2c98a19157c82347a2d89f401c8d67c4 (diff) | |
download | sandcrawler-fe8c71f7f942a02ed144847cca8149486ceaee9a.tar.gz sandcrawler-fe8c71f7f942a02ed144847cca8149486ceaee9a.zip |
fix DB import counting
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/db.py | 9 |
1 file changed, 5 insertions, 4 deletions
```diff
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py
index e89edc7..21ac82a 100644
--- a/python/sandcrawler/db.py
+++ b/python/sandcrawler/db.py
@@ -46,6 +46,7 @@ class SandcrawlerPostgresClient:
         return self.conn.commit()
 
     def _inserts_and_updates(self, resp, on_conflict):
+        resp = [int(r[0]) for r in resp]
         inserts = len([r for r in resp if r == 0])
         if on_conflict == "update":
             updates = len([r for r in resp if r != 0])
@@ -66,16 +67,16 @@ class SandcrawlerPostgresClient:
             raise NotImplementedError("on_conflict: {}".format(on_conflict))
         sql += " RETURNING xmax;"
 
-        batch = [d for d in batch if d.get('warc_offset')]
+        batch = [d for d in batch if d.get('warc_path')]
         if not batch:
-            return 0
+            return (0, 0)
         batch = [(d['url'],
                   d['datetime'],
                   d['sha1hex'],
                   d['mimetype'],
                   d['warc_path'],
-                  d['warc_csize'],
-                  d['warc_offset'])
+                  int(d['warc_csize']),
+                  int(d['warc_offset']))
                  for d in batch]
         resp = psycopg2.extras.execute_values(cur, sql, batch, page_size=250, fetch=True)
         return self._inserts_and_updates(resp, on_conflict)
```