diff options
-rw-r--r-- | notes/performance/kafka_pipeline.txt      | 17
-rw-r--r-- | python/fatcat_tools/workers/changelog.py  |  7
-rw-r--r-- | python/fatcat_tools/workers/elastic.py    |  6
3 files changed, 28 insertions, 2 deletions
| diff --git a/notes/performance/kafka_pipeline.txt b/notes/performance/kafka_pipeline.txt new file mode 100644 index 00000000..f0862d89 --- /dev/null +++ b/notes/performance/kafka_pipeline.txt @@ -0,0 +1,17 @@ + +## Early Notes (2018-11-13) + +Ran through about 100k crossref objects, resulting in about 77k messages (in +about 4k editgroups/changelogs). + +Have seen tens of messages per second go through trivially. + +The elastic-release worker is the current bottleneck, only some 4.3 +messages/second. Because this worker consumes from 8x partitions, I have a +feeling it might be consumer group related. kafka-manager shows "0% coverage" +for this topic. Note that this is a single worker process. + +_consumer_offsets is seeing about 36 messages/sec. + +Oh, looks like I just needed to enable auto_commit and tune parameters in +pykafka! diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py index 92bb8bdd..6319da2f 100644 --- a/python/fatcat_tools/workers/changelog.py +++ b/python/fatcat_tools/workers/changelog.py @@ -102,12 +102,17 @@ class FatcatEntityUpdatesWorker(FatcatWorker):              managed=True,              auto_offset_reset=OffsetType.LATEST,              reset_offset_on_start=False, +            fetch_message_max_bytes=4000000, # up to ~4MBytes +            auto_commit_enable=True, +            auto_commit_interval_ms=30000, # 30 seconds +            compacted_topic=True,          )          with release_topic.get_sync_producer() as producer:              for msg in consumer:                  cle = json.loads(msg.value.decode('utf-8'))                  #print(cle) +                print("processing changelog index {}".format(cle['index']))                  release_edits = cle['editgroup']['edits']['releases']                  for re in release_edits:                      ident = re['ident'] @@ -118,5 +123,5 @@ class FatcatEntityUpdatesWorker(FatcatWorker):                          
partition_key=ident.encode('utf-8'),                          timestamp=None,                      ) -                consumer.commit_offsets() +                #consumer.commit_offsets() diff --git a/python/fatcat_tools/workers/elastic.py b/python/fatcat_tools/workers/elastic.py index 119a6a26..3a75a1b3 100644 --- a/python/fatcat_tools/workers/elastic.py +++ b/python/fatcat_tools/workers/elastic.py @@ -32,6 +32,10 @@ class FatcatElasticReleaseWorker(FatcatWorker):          consumer = consume_topic.get_balanced_consumer(              consumer_group=self.consumer_group,              managed=True, +            fetch_message_max_bytes=4000000, # up to ~4MBytes +            auto_commit_enable=True, +            auto_commit_interval_ms=30000, # 30 seconds +            compacted_topic=True,          )          for msg in consumer: @@ -45,4 +49,4 @@ class FatcatElasticReleaseWorker(FatcatWorker):              print("Updating document: {}".format(elastic_endpoint))              resp = requests.post(elastic_endpoint, json=release_elastic_dict(release))              assert resp.status_code in (200, 201) -            consumer.commit_offsets() +            #consumer.commit_offsets() | 
