diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-12-26 21:19:02 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-01-02 18:12:58 -0800 |
commit | d6c744002607d8789927cc98fe6c8a6a76da24bd (patch) | |
tree | d5942ca78759f00c252aa95734b48881f11b4b0a /python | |
parent | beba257030d84af2f80c09ec695a35a733a2322d (diff) | |
download | sandcrawler-d6c744002607d8789927cc98fe6c8a6a76da24bd.tar.gz sandcrawler-d6c744002607d8789927cc98fe6c8a6a76da24bd.zip |
filter ingest results to not have key conflicts within batch
This handles a corner case with ON CONFLICT ... DO UPDATE where you
can't update the same row multiple times in the same batch transaction.
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/persist.py | 17 |
1 file changed, 16 insertions, 1 deletion
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py index 3b9cde9..ea54d6b 100644 --- a/python/sandcrawler/persist.py +++ b/python/sandcrawler/persist.py @@ -156,8 +156,23 @@ class PersistIngestFileResultWorker(SandcrawlerWorker): if not batch: return [] + # need to ensure that we aren't trying to update same row multiple + # times in same batch (!) results = [self.file_result_to_row(raw) for raw in batch] - results = [r for r in results if r] + results.reverse() + clean_results = [] + result_keys = [] + for r in results: + if not r: + continue + key = (r['ingest_type'], r['base_url']) + if key in result_keys: + self.counts['skip-duplicate-result'] += 1 + continue + result_keys.append(key) + clean_results.append(r) + results = clean_results + requests = [self.request_to_row(raw['request']) for raw in batch if raw.get('request')] requests = [r for r in requests if r] |