about summary refs log tree commit diff stats
path: root/python/sandcrawler/persist.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-01-02 18:01:04 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-01-02 18:12:59 -0800
commit 5dc1a8642077b67f3af0a41cdac851bb96a435b7 (patch)
tree c40a525c089db607e86a1c14256e0703531d4024 /python/sandcrawler/persist.py
parent 6093c9a0c9b65cdf790f200395e2d44d4fe6278b (diff)
downloadsandcrawler-5dc1a8642077b67f3af0a41cdac851bb96a435b7.tar.gz
sandcrawler-5dc1a8642077b67f3af0a41cdac851bb96a435b7.zip
db: move duplicate row filtering into DB insert helpers
Diffstat (limited to 'python/sandcrawler/persist.py')
-rw-r--r-- python/sandcrawler/persist.py 16
1 file changed, 1 insertion(+), 15 deletions(-)
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index c3f6b08..71ada51 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -156,22 +156,8 @@ class PersistIngestFileResultWorker(SandcrawlerWorker):
if not batch:
return []
- # need to ensure that we aren't trying to update same row multiple
- # times in same batch (!)
results = [self.file_result_to_row(raw) for raw in batch]
- results.reverse()
- clean_results = []
- result_keys = []
- for r in results:
- if not r:
- continue
- key = (r['ingest_type'], r['base_url'])
- if key in result_keys:
- self.counts['skip-duplicate-result'] += 1
- continue
- result_keys.append(key)
- clean_results.append(r)
- results = clean_results
+ results = [r for r in results if r]
requests = [self.request_to_row(raw['request']) for raw in batch if raw.get('request')]
requests = [r for r in requests if r]