author     Bryan Newbold <bnewbold@robocracy.org>    2019-04-08 21:11:25 -0700
committer  Bryan Newbold <bnewbold@robocracy.org>    2019-09-20 11:21:10 -0700
commit     345b24d6a9efbffd0ff3fd3c65e22894b498a2c6 (patch)
tree       a24d992d580e49b877981ff815e02c54615d6ede /python/fatcat_tools/workers/elasticsearch.py
parent     a8cde38d7f95908049d50d0b94ed33f3aa2cc75d (diff)
download   fatcat-345b24d6a9efbffd0ff3fd3c65e22894b498a2c6.tar.gz
           fatcat-345b24d6a9efbffd0ff3fd3c65e22894b498a2c6.zip
convert pipeline workers from pykafka to confluent-kafka
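
The core change is replacing pykafka's managed balanced consumer (an iterator with auto-commit) with confluent-kafka's batch consume loop, where offsets are stored by hand only after a batch has been fully processed. Below is a minimal sketch of the new consumer pattern, assuming a local broker and a placeholder topic name; the process() helper is hypothetical:

# Sketch of the confluent-kafka consume/store-offsets pattern this commit
# adopts; broker address, topic name, and process() are illustrative only.
from confluent_kafka import Consumer, KafkaException

conf = {
    'bootstrap.servers': 'localhost:9092',  # assumption: local broker
    'group.id': 'elasticsearch-updates',
    # store offsets manually, but let the client auto-commit what was stored
    'enable.auto.offset.store': False,
    'default.topic.config': {'auto.offset.reset': 'latest'},
}
consumer = Consumer(conf)
consumer.subscribe(['release-updates'])  # placeholder topic

try:
    while True:
        # block up to 10 seconds, returning at most 200 messages
        batch = consumer.consume(num_messages=200, timeout=10.0)
        for msg in batch:
            if msg.error():
                raise KafkaException(msg.error())
            process(msg.value())  # hypothetical per-message work
        if batch:
            # only record progress once the whole batch succeeded
            consumer.store_offsets(batch[-1])
finally:
    consumer.close()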
Diffstat (limited to 'python/fatcat_tools/workers/elasticsearch.py')
-rw-r--r--  python/fatcat_tools/workers/elasticsearch.py  95
1 file changed, 66 insertions, 29 deletions
diff --git a/python/fatcat_tools/workers/elasticsearch.py b/python/fatcat_tools/workers/elasticsearch.py
index 91224d98..547e270c 100644
--- a/python/fatcat_tools/workers/elasticsearch.py
+++ b/python/fatcat_tools/workers/elasticsearch.py
@@ -2,7 +2,7 @@
 import json
 import time
 import requests
-from pykafka.common import OffsetType
+from confluent_kafka import Consumer, Producer, KafkaException

 from fatcat_openapi_client import ReleaseEntity, ContainerEntity, ApiClient
 from fatcat_tools import *
@@ -18,10 +18,13 @@ class ElasticsearchReleaseWorker(FatcatWorker):
     """

     def __init__(self, kafka_hosts, consume_topic, poll_interval=10.0, offset=None,
-            elasticsearch_backend="http://localhost:9200", elasticsearch_index="fatcat"):
+            elasticsearch_backend="http://localhost:9200", elasticsearch_index="fatcat",
+            batch_size=200):
         super().__init__(kafka_hosts=kafka_hosts,
                          consume_topic=consume_topic)
         self.consumer_group = "elasticsearch-updates"
+        self.batch_size = batch_size
+        self.poll_interval = poll_interval
         self.elasticsearch_backend = elasticsearch_backend
         self.elasticsearch_index = elasticsearch_index
         self.entity_type = ReleaseEntity
@@ -29,52 +32,86 @@ class ElasticsearchReleaseWorker(FatcatWorker):
         self.transform_func = release_to_elasticsearch

     def run(self):
-        consume_topic = self.kafka.topics[self.consume_topic]
         ac = ApiClient()

-        consumer = consume_topic.get_balanced_consumer(
-            consumer_group=self.consumer_group,
-            managed=True,
-            fetch_message_max_bytes=10000000, # up to ~10 MBytes
-            auto_commit_enable=True,
-            auto_commit_interval_ms=30000, # 30 seconds
-            compacted_topic=True,
+        def on_rebalance(consumer, partitions):
+            for p in partitions:
+                if p.error:
+                    raise KafkaException(p.error)
+            print("Kafka partitions rebalanced: {} / {}".format(
+                consumer, partitions))
+
+        consumer_conf = self.kafka_config.copy()
+        consumer_conf.update({
+            'group.id': self.consumer_group,
+            'enable.auto.offset.store': False,
+            'default.topic.config': {
+                'auto.offset.reset': 'latest',
+            },
+        })
+        consumer = Consumer(consumer_conf)
+        consumer.subscribe([self.consume_topic],
+            on_assign=on_rebalance,
+            on_revoke=on_rebalance,
         )

-        for msg in consumer:
-            json_str = msg.value.decode('utf-8')
-            # HACK: work around a bug where container entities got published to
-            # release_v03 topic
-            if self.elasticsearch_document_name == "release":
-                entity_dict = json.loads(json_str)
-                if entity_dict.get('name') and not entity_dict.get('title'):
-                    continue
-            entity = entity_from_json(json_str, self.entity_type, api_client=ac)
-            #print(entity)
-            elasticsearch_endpoint = "{}/{}/{}/{}".format(
+        while True:
+            batch = consumer.consume(
+                num_messages=self.batch_size,
+                timeout=self.poll_interval)
+            if not batch:
+                if not consumer.assignment():
+                    print("... no Kafka consumer partitions assigned yet")
+                print("... nothing new from kafka, try again (interval: {})".format(self.poll_interval))
+                continue
+            print("... got {} kafka messages".format(len(batch)))
+            # first check errors on entire batch...
+            for msg in batch:
+                if msg.error():
+                    raise KafkaException(msg.error())
+            # ... then process
+            bulk_actions = []
+            for msg in batch:
+                json_str = msg.value().decode('utf-8')
+                entity = entity_from_json(json_str, ReleaseEntity, api_client=ac)
+                print("Upserting: release/{}".format(entity.ident))
+                bulk_actions.append(json.dumps({
+                    "index": { "_id": entity.ident, },
+                }))
+                bulk_actions.append(json.dumps(
+                    release_to_elasticsearch(entity)))
+            elasticsearch_endpoint = "{}/{}/release/_bulk".format(
                 self.elasticsearch_backend,
-                self.elasticsearch_index,
-                self.elasticsearch_document_name,
-                entity.ident)
-            print("Updating document: {}".format(elasticsearch_endpoint))
-            resp = requests.post(elasticsearch_endpoint, json=self.transform_func(entity))
+                self.elasticsearch_index)
+            resp = requests.post(elasticsearch_endpoint,
+                headers={"Content-Type": "application/x-ndjson"},
+                data="\n".join(bulk_actions) + "\n")
             resp.raise_for_status()
-            #consumer.commit_offsets()
+            if resp.json()['errors']:
+                desc = "Elasticsearch errors from post to {}:".format(elasticsearch_endpoint)
+                print(desc)
+                print(resp.content)
+                raise Exception(desc)
+            consumer.store_offsets(batch[-1])
+

 class ElasticsearchContainerWorker(ElasticsearchReleaseWorker):

     def __init__(self, kafka_hosts, consume_topic, poll_interval=10.0, offset=None,
-            elasticsearch_backend="http://localhost:9200", elasticsearch_index="fatcat"):
+            elasticsearch_backend="http://localhost:9200", elasticsearch_index="fatcat",
+            batch_size=200):
         super().__init__(kafka_hosts=kafka_hosts,
                          consume_topic=consume_topic,
                          poll_interval=poll_interval,
                          offset=offset,
                          elasticsearch_backend=elasticsearch_backend,
-                         elasticsearch_index=elasticsearch_index)
+                         elasticsearch_index=elasticsearch_index,
+                         batch_size=batch_size)
         # previous group got corrupted (by pykafka library?)
         self.consumer_group = "elasticsearch-updates2"
         self.entity_type = ContainerEntity
         self.elasticsearch_document_name = "container"
         self.transform_func = container_to_elasticsearch
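
Instead of one POST per entity, the new run() loop goes through Elasticsearch's bulk endpoint: each document contributes two newline-delimited JSON lines, an action line carrying the _id followed by the document source. A standalone sketch of the request body the loop assembles, with made-up idents and trimmed documents (the backend URL and index name are the constructor defaults):

# Sketch of the NDJSON bulk request the worker builds; the idents and
# document fields below are made-up illustrative values.
import json
import requests

docs = [
    ("aaaaaaaaaaaaarceaaaaaaaaae", {"ident": "aaaaaaaaaaaaarceaaaaaaaaae", "title": "First"}),
    ("aaaaaaaaaaaaarceaaaaaaaaai", {"ident": "aaaaaaaaaaaaarceaaaaaaaaai", "title": "Second"}),
]
bulk_actions = []
for ident, doc in docs:
    bulk_actions.append(json.dumps({"index": {"_id": ident}}))  # action line
    bulk_actions.append(json.dumps(doc))                        # source line

resp = requests.post(
    "http://localhost:9200/fatcat/release/_bulk",
    headers={"Content-Type": "application/x-ndjson"},
    data="\n".join(bulk_actions) + "\n",  # bulk bodies must end with a newline
)
resp.raise_for_status()
# the bulk API returns HTTP 200 even when individual actions failed
assert not resp.json()["errors"]

Per-action failures do not raise an HTTP error, which is why the worker inspects the errors flag in the response body and only stores offsets afterwards.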
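Putting it together, a hypothetical invocation of the converted worker; the broker address and topic name are assumptions, not values from this commit:

# Hypothetical usage; kafka_hosts and consume_topic values are placeholders.
from fatcat_tools.workers.elasticsearch import ElasticsearchReleaseWorker

worker = ElasticsearchReleaseWorker(
    kafka_hosts="localhost:9092",
    consume_topic="release-updates",  # placeholder topic
    elasticsearch_backend="http://localhost:9200",
    elasticsearch_index="fatcat",
    batch_size=200,
)
worker.run()  # blocks forever, consuming and bulk-indexing

Because enable.auto.offset.store is disabled, consumer.store_offsets(batch[-1]) only records progress after the bulk post succeeds, so a crash mid-batch re-delivers the whole batch on restart (at-least-once delivery) rather than silently skipping it.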