aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2019-12-26 21:19:02 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-01-02 18:12:58 -0800
commit d6c744002607d8789927cc98fe6c8a6a76da24bd (patch)
tree d5942ca78759f00c252aa95734b48881f11b4b0a /python
parent beba257030d84af2f80c09ec695a35a733a2322d (diff)
download sandcrawler-d6c744002607d8789927cc98fe6c8a6a76da24bd.tar.gz
sandcrawler-d6c744002607d8789927cc98fe6c8a6a76da24bd.zip
filter ingest results to not have key conflicts within batch
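This handles a corner case with ON CONFLICT ... DO UPDATE, where the same row cannot be updated more than once within a single batch transaction. Results are now de-duplicated by their (ingest_type, base_url) key before insertion: only the last result in the batch for each key is kept, and each skipped duplicate is counted under 'skip-duplicate-result'.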
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/persist.py 17
1 files changed, 16 insertions, 1 deletions
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 3b9cde9..ea54d6b 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -156,8 +156,23 @@ class PersistIngestFileResultWorker(SandcrawlerWorker):
if not batch:
return []
+ # need to ensure that we aren't trying to update same row multiple
+ # times in same batch (!)
results = [self.file_result_to_row(raw) for raw in batch]
- results = [r for r in results if r]
+ results.reverse()
+ clean_results = []
+ result_keys = []
+ for r in results:
+ if not r:
+ continue
+ key = (r['ingest_type'], r['base_url'])
+ if key in result_keys:
+ self.counts['skip-duplicate-result'] += 1
+ continue
+ result_keys.append(key)
+ clean_results.append(r)
+ results = clean_results
+
requests = [self.request_to_row(raw['request']) for raw in batch if raw.get('request')]
requests = [r for r in requests if r]