diff options
-rw-r--r-- | notes/performance/kafka_pipeline.txt      | 17
-rw-r--r-- | python/fatcat_tools/workers/changelog.py  |  7
-rw-r--r-- | python/fatcat_tools/workers/elastic.py    |  6
3 files changed, 28 insertions, 2 deletions
| diff --git a/notes/performance/kafka_pipeline.txt b/notes/performance/kafka_pipeline.txt new file mode 100644 index 00000000..f0862d89 --- /dev/null +++ b/notes/performance/kafka_pipeline.txt @@ -0,0 +1,17 @@ + +## Early Notes (2018-11-13) + +Ran through about 100k crossref objects, resulting in about 77k messages (in +about 4k editgroups/changelogs). + +Have seen tens of messages per second go through trivially. + +The elastic-release worker is the current bottleneck, only some 4.3 +messages/second. Because this worker consumes from 8x partitions, I have a +feeling it might be consumer group related. kafka-manager shows "0% coverage" +for this topic. Note that this is a single worker process. + +_consumer_offsets is seeing about 36 messages/sec. + +Oh, looks like I just needed to enable auto_commit and tune parameters in +pykafka! diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py index 92bb8bdd..6319da2f 100644 --- a/python/fatcat_tools/workers/changelog.py +++ b/python/fatcat_tools/workers/changelog.py @@ -102,12 +102,17 @@ class FatcatEntityUpdatesWorker(FatcatWorker):              managed=True,              auto_offset_reset=OffsetType.LATEST,              reset_offset_on_start=False, +            fetch_message_max_bytes=4000000, # up to ~4MBytes +            auto_commit_enable=True, +            auto_commit_interval_ms=30000, # 30 seconds +            compacted_topic=True,          )          with release_topic.get_sync_producer() as producer:              for msg in consumer:                  cle = json.loads(msg.value.decode('utf-8'))                  #print(cle) +                print("processing changelog index {}".format(cle['index']))                  release_edits = cle['editgroup']['edits']['releases']                  for re in release_edits:                      ident = re['ident'] @@ -118,5 +123,5 @@ class FatcatEntityUpdatesWorker(FatcatWorker):                          
partition_key=ident.encode('utf-8'),                          timestamp=None,                      ) -                consumer.commit_offsets() +                #consumer.commit_offsets() diff --git a/python/fatcat_tools/workers/elastic.py b/python/fatcat_tools/workers/elastic.py index 119a6a26..3a75a1b3 100644 --- a/python/fatcat_tools/workers/elastic.py +++ b/python/fatcat_tools/workers/elastic.py @@ -32,6 +32,10 @@ class FatcatElasticReleaseWorker(FatcatWorker):          consumer = consume_topic.get_balanced_consumer(              consumer_group=self.consumer_group,              managed=True, +            fetch_message_max_bytes=4000000, # up to ~4MBytes +            auto_commit_enable=True, +            auto_commit_interval_ms=30000, # 30 seconds +            compacted_topic=True,          )          for msg in consumer: @@ -45,4 +49,4 @@ class FatcatElasticReleaseWorker(FatcatWorker):              print("Updating document: {}".format(elastic_endpoint))              resp = requests.post(elastic_endpoint, json=release_elastic_dict(release))              assert resp.status_code in (200, 201) -            consumer.commit_offsets() +            #consumer.commit_offsets() | 
