summary refs log tree commit diff stats
path: root/python/fatcat_tools/workers/elasticsearch.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2019-04-08 21:11:25 -0700
committer Bryan Newbold <bnewbold@robocracy.org> 2019-09-20 11:21:10 -0700
commit 345b24d6a9efbffd0ff3fd3c65e22894b498a2c6 (patch)
tree a24d992d580e49b877981ff815e02c54615d6ede /python/fatcat_tools/workers/elasticsearch.py
parent a8cde38d7f95908049d50d0b94ed33f3aa2cc75d (diff)
download fatcat-345b24d6a9efbffd0ff3fd3c65e22894b498a2c6.tar.gz
fatcat-345b24d6a9efbffd0ff3fd3c65e22894b498a2c6.zip
convert pipeline workers from pykafka to confluent-kafka
Diffstat (limited to 'python/fatcat_tools/workers/elasticsearch.py')
-rw-r--r-- python/fatcat_tools/workers/elasticsearch.py 95
1 files changed, 66 insertions, 29 deletions
diff --git a/python/fatcat_tools/workers/elasticsearch.py b/python/fatcat_tools/workers/elasticsearch.py
index 91224d98..547e270c 100644
--- a/python/fatcat_tools/workers/elasticsearch.py
+++ b/python/fatcat_tools/workers/elasticsearch.py
@@ -2,7 +2,7 @@
import json
import time
import requests
-from pykafka.common import OffsetType
+from confluent_kafka import Consumer, Producer, KafkaException
from fatcat_openapi_client import ReleaseEntity, ContainerEntity, ApiClient
from fatcat_tools import *
@@ -18,10 +18,13 @@ class ElasticsearchReleaseWorker(FatcatWorker):
"""
def __init__(self, kafka_hosts, consume_topic, poll_interval=10.0, offset=None,
- elasticsearch_backend="http://localhost:9200", elasticsearch_index="fatcat"):
+ elasticsearch_backend="http://localhost:9200", elasticsearch_index="fatcat",
+ batch_size=200):
super().__init__(kafka_hosts=kafka_hosts,
consume_topic=consume_topic)
self.consumer_group = "elasticsearch-updates"
+ self.batch_size = batch_size
+ self.poll_interval = poll_interval
self.elasticsearch_backend = elasticsearch_backend
self.elasticsearch_index = elasticsearch_index
self.entity_type = ReleaseEntity
@@ -29,52 +32,86 @@ class ElasticsearchReleaseWorker(FatcatWorker):
self.transform_func = release_to_elasticsearch
def run(self):
- consume_topic = self.kafka.topics[self.consume_topic]
ac = ApiClient()
- consumer = consume_topic.get_balanced_consumer(
- consumer_group=self.consumer_group,
- managed=True,
- fetch_message_max_bytes=10000000, # up to ~10 MBytes
- auto_commit_enable=True,
- auto_commit_interval_ms=30000, # 30 seconds
- compacted_topic=True,
+ def on_rebalance(consumer, partitions):
+ for p in partitions:
+ if p.error:
+ raise KafkaException(p.error)
+ print("Kafka partitions rebalanced: {} / {}".format(
+ consumer, partitions))
+
+ consumer_conf = self.kafka_config.copy()
+ consumer_conf.update({
+ 'group.id': self.consumer_group,
+ 'enable.auto.offset.store': False,
+ 'default.topic.config': {
+ 'auto.offset.reset': 'latest',
+ },
+ })
+ consumer = Consumer(consumer_conf)
+ consumer.subscribe([self.consume_topic],
+ on_assign=on_rebalance,
+ on_revoke=on_rebalance,
)
- for msg in consumer:
- json_str = msg.value.decode('utf-8')
- # HACK: work around a bug where container entities got published to
- # release_v03 topic
- if self.elasticsearch_document_name == "release":
- entity_dict = json.loads(json_str)
- if entity_dict.get('name') and not entity_dict.get('title'):
- continue
- entity = entity_from_json(json_str, self.entity_type, api_client=ac)
- #print(entity)
- elasticsearch_endpoint = "{}/{}/{}/{}".format(
+ while True:
+ batch = consumer.consume(
+ num_messages=self.batch_size,
+ timeout=self.poll_interval)
+ if not batch:
+ if not consumer.assignment():
+ print("... no Kafka consumer partitions assigned yet")
+ print("... nothing new from kafka, try again (interval: {}".format(self.poll_interval))
+ continue
+ print("... got {} kafka messages".format(len(batch)))
+ # first check errors on entire batch...
+ for msg in batch:
+ if msg.error():
+ raise KafkaException(msg.error())
+ # ... then process
+ bulk_actions = []
+ for msg in batch:
+ json_str = msg.value().decode('utf-8')
+ entity = entity_from_json(json_str, ReleaseEntity, api_client=ac)
+ print("Upserting: release/{}".format(entity.ident))
+ bulk_actions.append(json.dumps({
+ "index": { "_id": entity.ident, },
+ }))
+ bulk_actions.append(json.dumps(
+ release_to_elasticsearch(entity)))
+ elasticsearch_endpoint = "{}/{}/release/_bulk".format(
self.elasticsearch_backend,
- self.elasticsearch_index,
- self.elasticsearch_document_name,
- entity.ident)
- print("Updating document: {}".format(elasticsearch_endpoint))
- resp = requests.post(elasticsearch_endpoint, json=self.transform_func(entity))
+ self.elasticsearch_index)
+ resp = requests.post(elasticsearch_endpoint,
+ headers={"Content-Type": "application/x-ndjson"},
+ data="\n".join(bulk_actions) + "\n")
resp.raise_for_status()
- #consumer.commit_offsets()
+ if resp.json()['errors']:
+ desc = "Elasticsearch errors from post to {}:".format(elasticsearch_endpoint)
+ print(desc)
+ print(resp.content)
+ raise Exception(desc)
+ consumer.store_offsets(batch[-1])
+
class ElasticsearchContainerWorker(ElasticsearchReleaseWorker):
def __init__(self, kafka_hosts, consume_topic, poll_interval=10.0, offset=None,
- elasticsearch_backend="http://localhost:9200", elasticsearch_index="fatcat"):
+ elasticsearch_backend="http://localhost:9200", elasticsearch_index="fatcat",
+ batch_size=200):
super().__init__(kafka_hosts=kafka_hosts,
consume_topic=consume_topic,
poll_interval=poll_interval,
offset=offset,
elasticsearch_backend=elasticsearch_backend,
- elasticsearch_index=elasticsearch_index)
+ elasticsearch_index=elasticsearch_index,
+ batch_size=batch_size)
# previous group got corrupted (by pykafka library?)
self.consumer_group = "elasticsearch-updates2"
self.entity_type = ContainerEntity
self.elasticsearch_document_name = "container"
self.transform_func = container_to_elasticsearch
+