about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- python/sandcrawler/db.py 9
1 files changed, 5 insertions, 4 deletions
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py
index e89edc7..21ac82a 100644
--- a/python/sandcrawler/db.py
+++ b/python/sandcrawler/db.py
@@ -46,6 +46,7 @@ class SandcrawlerPostgresClient:
return self.conn.commit()
def _inserts_and_updates(self, resp, on_conflict):
+ resp = [int(r[0]) for r in resp]
inserts = len([r for r in resp if r == 0])
if on_conflict == "update":
updates = len([r for r in resp if r != 0])
@@ -66,16 +67,16 @@ class SandcrawlerPostgresClient:
raise NotImplementedError("on_conflict: {}".format(on_conflict))
sql += " RETURNING xmax;"
- batch = [d for d in batch if d.get('warc_offset')]
+ batch = [d for d in batch if d.get('warc_path')]
if not batch:
- return 0
+ return (0, 0)
batch = [(d['url'],
d['datetime'],
d['sha1hex'],
d['mimetype'],
d['warc_path'],
- d['warc_csize'],
- d['warc_offset'])
+ int(d['warc_csize']),
+ int(d['warc_offset']))
for d in batch]
resp = psycopg2.extras.execute_values(cur, sql, batch, page_size=250, fetch=True)
return self._inserts_and_updates(resp, on_conflict)