From 7634f6ecf2361b1cb1cafd4e27fd1fb84d81d130 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 13 Nov 2018 23:48:45 -0800 Subject: switch to auto consumer offset updates This is the classic/correct way to do consumer group updates for higher throughput, when "at least once" semantics are acceptible (as they are here; double processing should be safe/fine). --- python/fatcat_tools/workers/changelog.py | 7 ++++++- python/fatcat_tools/workers/elastic.py | 6 +++++- 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'python') diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py index 92bb8bdd..6319da2f 100644 --- a/python/fatcat_tools/workers/changelog.py +++ b/python/fatcat_tools/workers/changelog.py @@ -102,12 +102,17 @@ class FatcatEntityUpdatesWorker(FatcatWorker): managed=True, auto_offset_reset=OffsetType.LATEST, reset_offset_on_start=False, + fetch_message_max_bytes=4000000, # up to ~4MBytes + auto_commit_enable=True, + auto_commit_interval_ms=30000, # 30 seconds + compacted_topic=True, ) with release_topic.get_sync_producer() as producer: for msg in consumer: cle = json.loads(msg.value.decode('utf-8')) #print(cle) + print("processing changelog index {}".format(cle['index'])) release_edits = cle['editgroup']['edits']['releases'] for re in release_edits: ident = re['ident'] @@ -118,5 +123,5 @@ class FatcatEntityUpdatesWorker(FatcatWorker): partition_key=ident.encode('utf-8'), timestamp=None, ) - consumer.commit_offsets() + #consumer.commit_offsets() diff --git a/python/fatcat_tools/workers/elastic.py b/python/fatcat_tools/workers/elastic.py index 119a6a26..3a75a1b3 100644 --- a/python/fatcat_tools/workers/elastic.py +++ b/python/fatcat_tools/workers/elastic.py @@ -32,6 +32,10 @@ class FatcatElasticReleaseWorker(FatcatWorker): consumer = consume_topic.get_balanced_consumer( consumer_group=self.consumer_group, managed=True, + fetch_message_max_bytes=4000000, # up to ~4MBytes + auto_commit_enable=True, + auto_commit_interval_ms=30000, # 30 seconds + compacted_topic=True, ) for msg in consumer: @@ -45,4 +49,4 @@ class FatcatElasticReleaseWorker(FatcatWorker): print("Updating document: {}".format(elastic_endpoint)) resp = requests.post(elastic_endpoint, json=release_elastic_dict(release)) assert resp.status_code in (200, 201) - consumer.commit_offsets() + #consumer.commit_offsets() -- cgit v1.2.3