about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2019-12-27 12:45:15 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-01-02 18:12:58 -0800
commit: fe8c71f7f942a02ed144847cca8149486ceaee9a (patch)
tree: 443081f167208a893645a74dcbe395ef2d74ae81
parent: 759f5cce2c98a19157c82347a2d89f401c8d67c4 (diff)
download: sandcrawler-fe8c71f7f942a02ed144847cca8149486ceaee9a.tar.gz
download: sandcrawler-fe8c71f7f942a02ed144847cca8149486ceaee9a.zip
fix DB import counting
-rw-r--r-- python/sandcrawler/db.py 9
1 file changed, 5 insertions, 4 deletions
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py
index e89edc7..21ac82a 100644
--- a/python/sandcrawler/db.py
+++ b/python/sandcrawler/db.py
@@ -46,6 +46,7 @@ class SandcrawlerPostgresClient:
return self.conn.commit()
def _inserts_and_updates(self, resp, on_conflict):
+ resp = [int(r[0]) for r in resp]
inserts = len([r for r in resp if r == 0])
if on_conflict == "update":
updates = len([r for r in resp if r != 0])
@@ -66,16 +67,16 @@ class SandcrawlerPostgresClient:
raise NotImplementedError("on_conflict: {}".format(on_conflict))
sql += " RETURNING xmax;"
- batch = [d for d in batch if d.get('warc_offset')]
+ batch = [d for d in batch if d.get('warc_path')]
if not batch:
- return 0
+ return (0, 0)
batch = [(d['url'],
d['datetime'],
d['sha1hex'],
d['mimetype'],
d['warc_path'],
- d['warc_csize'],
- d['warc_offset'])
+ int(d['warc_csize']),
+ int(d['warc_offset']))
for d in batch]
resp = psycopg2.extras.execute_values(cur, sql, batch, page_size=250, fetch=True)
return self._inserts_and_updates(resp, on_conflict)