aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/db.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-01-02 18:01:04 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-01-02 18:12:59 -0800
commit 5dc1a8642077b67f3af0a41cdac851bb96a435b7 (patch)
tree c40a525c089db607e86a1c14256e0703531d4024 /python/sandcrawler/db.py
parent 6093c9a0c9b65cdf790f200395e2d44d4fe6278b (diff)
download sandcrawler-5dc1a8642077b67f3af0a41cdac851bb96a435b7.tar.gz
sandcrawler-5dc1a8642077b67f3af0a41cdac851bb96a435b7.zip
db: move duplicate row filtering into DB insert helpers
Diffstat (limited to 'python/sandcrawler/db.py')
-rw-r--r-- python/sandcrawler/db.py 25
1 files changed, 25 insertions, 0 deletions
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py
index 21ac82a..eb1a922 100644
--- a/python/sandcrawler/db.py
+++ b/python/sandcrawler/db.py
@@ -78,6 +78,11 @@ class SandcrawlerPostgresClient:
int(d['warc_csize']),
int(d['warc_offset']))
for d in batch]
+ # filter out duplicate rows by key (url, datetime)
+ batch_dict = dict()
+ for b in batch:
+ batch_dict[(b[0], b[1])] = b
+ batch = list(batch_dict.values())
resp = psycopg2.extras.execute_values(cur, sql, batch, page_size=250, fetch=True)
return self._inserts_and_updates(resp, on_conflict)
@@ -106,6 +111,11 @@ class SandcrawlerPostgresClient:
int(d['size_bytes']),
d['mimetype'])
for d in batch]
+ # filter out duplicate rows by key (sha1hex)
+ batch_dict = dict()
+ for b in batch:
+ batch_dict[b[0]] = b
+ batch = list(batch_dict.values())
resp = psycopg2.extras.execute_values(cur, sql, batch, page_size=250, fetch=True)
return self._inserts_and_updates(resp, on_conflict)
@@ -142,6 +152,11 @@ class SandcrawlerPostgresClient:
d.get('metadata') or None ,
)
for d in batch]
+ # filter out duplicate rows by key (sha1hex)
+ batch_dict = dict()
+ for b in batch:
+ batch_dict[b[0]] = b
+ batch = list(batch_dict.values())
resp = psycopg2.extras.execute_values(cur, sql, batch, page_size=250, fetch=True)
return self._inserts_and_updates(resp, on_conflict)
@@ -173,6 +188,11 @@ class SandcrawlerPostgresClient:
d.get('extra') or None,
)
for d in batch]
+ # filter out duplicate rows by key (link_source, link_source_id, ingest_type, base_url)
+ batch_dict = dict()
+ for b in batch:
+ batch_dict[(b[0], b[1], b[2], b[3])] = b
+ batch = list(batch_dict.values())
resp = psycopg2.extras.execute_values(cur, sql, batch, page_size=250, fetch=True)
return self._inserts_and_updates(resp, on_conflict)
@@ -208,5 +228,10 @@ class SandcrawlerPostgresClient:
d.get('terminal_sha1hex'),
)
for d in batch]
+ # filter out duplicate rows by key (ingest_type, base_url)
+ batch_dict = dict()
+ for b in batch:
+ batch_dict[(b[0], b[1])] = b
+ batch = list(batch_dict.values())
resp = psycopg2.extras.execute_values(cur, sql, batch, page_size=250, fetch=True)
return self._inserts_and_updates(resp, on_conflict)