From a14d851ad230b3adb569ec6ca112cd4d9e638b2c Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 18 Apr 2019 16:23:09 -0700 Subject: mechanism to not double-update entities --- python/fatcat_tools/importers/arabesque.py | 7 ++++++- python/fatcat_tools/importers/common.py | 3 +++ 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'python') diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py index c0311903..c4850592 100644 --- a/python/fatcat_tools/importers/arabesque.py +++ b/python/fatcat_tools/importers/arabesque.py @@ -149,6 +149,10 @@ class ArabesqueMatchImporter(EntityImporter): self.counts['skip-update-disabled'] += 1 return False + if existing.ident in [e.ident for e in self._edits_inflight]: + self.counts['skip-update-inflight'] += 1 + return False + # TODO: this code path never gets hit because of the check above if set(fe.release_ids) == set(existing.release_ids): existing_urls = set([u.url for u in existing.urls]) @@ -162,7 +166,8 @@ class ArabesqueMatchImporter(EntityImporter): existing.urls = [fatcat_client.FileEntityUrls(rel=rel, url=url) for (rel, url) in existing.urls] existing.release_ids = list(set(fe.release_ids + existing.release_ids)) existing.mimetype = existing.mimetype or fe.mimetype - self.api.update_file(existing.ident, existing, editgroup_id=self.get_editgroup_id()) + edit = self.api.update_file(existing.ident, existing, editgroup_id=self.get_editgroup_id()) + self._edits_inflight.append(edit) self.counts['update'] += 1 return False diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index dd30e198..49931542 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -171,6 +171,7 @@ class EntityImporter: self._edit_count = 0 self._editgroup_id = None self._entity_queue = [] + self._edits_inflight = [] def push_record(self, raw_record): """ @@ -199,6 +200,7 @@ class EntityImporter: self.api.accept_editgroup(self._editgroup_id) self._editgroup_id = None self._edit_count = 0 + self._edits_inflight = [] if self._entity_queue: self.insert_batch(self._entity_queue) @@ -215,6 +217,7 @@ class EntityImporter: self.api.accept_editgroup(self._editgroup_id) self._editgroup_id = None self._edit_count = 0 + self._edits_inflight = [] if not self._editgroup_id: eg = self.api.create_editgroup( -- cgit v1.2.3