author     Bryan Newbold <bnewbold@robocracy.org>    2019-04-08 21:11:25 -0700
committer  Bryan Newbold <bnewbold@robocracy.org>    2019-09-20 11:21:10 -0700
commit     345b24d6a9efbffd0ff3fd3c65e22894b498a2c6 (patch)
tree       a24d992d580e49b877981ff815e02c54615d6ede /python/fatcat_tools/workers/elasticsearch.py
parent     a8cde38d7f95908049d50d0b94ed33f3aa2cc75d (diff)
download   fatcat-345b24d6a9efbffd0ff3fd3c65e22894b498a2c6.tar.gz
           fatcat-345b24d6a9efbffd0ff3fd3c65e22894b498a2c6.zip
convert pipeline workers from pykafka to confluent-kafka
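
The core change is replacing pykafka's managed balanced consumer (an iterator with auto-commit) with confluent-kafka's batch consume loop, where offsets are stored by hand only after a batch has been fully processed. Below is a minimal sketch of the new consumer pattern, assuming a local broker and a placeholder topic name; the process() helper is hypothetical:

# Sketch of the confluent-kafka consume/store-offsets pattern this commit
# adopts; broker address, topic name, and process() are illustrative only.
from confluent_kafka import Consumer, KafkaException

conf = {
    'bootstrap.servers': 'localhost:9092',  # assumption: local broker
    'group.id': 'elasticsearch-updates',
    # store offsets manually, but let the client auto-commit what was stored
    'enable.auto.offset.store': False,
    'default.topic.config': {'auto.offset.reset': 'latest'},
}
consumer = Consumer(conf)
consumer.subscribe(['release-updates'])  # placeholder topic

try:
    while True:
        # block up to 10 seconds, returning at most 200 messages
        batch = consumer.consume(num_messages=200, timeout=10.0)
        for msg in batch:
            if msg.error():
                raise KafkaException(msg.error())
            process(msg.value())  # hypothetical per-message work
        if batch:
            # only record progress once the whole batch succeeded
            consumer.store_offsets(batch[-1])
finally:
    consumer.close()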
Diffstat (limited to 'python/fatcat_tools/workers/elasticsearch.py')
-rw-r--r--  python/fatcat_tools/workers/elasticsearch.py  95
1 file changed, 66 insertions, 29 deletions
diff --git a/python/fatcat_tools/workers/elasticsearch.py b/python/fatcat_tools/workers/elasticsearch.py
index 91224d98..547e270c 100644
--- a/python/fatcat_tools/workers/elasticsearch.py
+++ b/python/fatcat_tools/workers/elasticsearch.py
@@ -2,7 +2,7 @@
 import json
 import time
 import requests
-from pykafka.common import OffsetType
+from confluent_kafka import Consumer, Producer, KafkaException

 from fatcat_openapi_client import ReleaseEntity, ContainerEntity, ApiClient
 from fatcat_tools import *
@@ -18,10 +18,13 @@ class ElasticsearchReleaseWorker(FatcatWorker):
     """

     def __init__(self, kafka_hosts, consume_topic, poll_interval=10.0, offset=None,
-            elasticsearch_backend="http://localhost:9200", elasticsearch_index="fatcat"):
+            elasticsearch_backend="http://localhost:9200", elasticsearch_index="fatcat",
+            batch_size=200):
         super().__init__(kafka_hosts=kafka_hosts,
                          consume_topic=consume_topic)
         self.consumer_group = "elasticsearch-updates"
+        self.batch_size = batch_size
+        self.poll_interval = poll_interval
         self.elasticsearch_backend = elasticsearch_backend
         self.elasticsearch_index = elasticsearch_index
         self.entity_type = ReleaseEntity
@@ -29,52 +32,86 @@ class ElasticsearchReleaseWorker(FatcatWorker):
         self.transform_func = release_to_elasticsearch

     def run(self):
-        consume_topic = self.kafka.topics[self.consume_topic]
         ac = ApiClient()

-        consumer = consume_topic.get_balanced_consumer(
-            consumer_group=self.consumer_group,
-            managed=True,
-            fetch_message_max_bytes=10000000, # up to ~10 MBytes
-            auto_commit_enable=True,
-            auto_commit_interval_ms=30000, # 30 seconds
-            compacted_topic=True,
+        def on_rebalance(consumer, partitions):
+            for p in partitions:
+                if p.error:
+                    raise KafkaException(p.error)
+            print("Kafka partitions rebalanced: {} / {}".format(
+                consumer, partitions))
+
+        consumer_conf = self.kafka_config.copy()
+        consumer_conf.update({
+            'group.id': self.consumer_group,
+            'enable.auto.offset.store': False,
+            'default.topic.config': {
+                'auto.offset.reset': 'latest',
+            },
+        })
+        consumer = Consumer(consumer_conf)
+        consumer.subscribe([self.consume_topic],
+            on_assign=on_rebalance,
+            on_revoke=on_rebalance,
         )

-        for msg in consumer:
-            json_str = msg.value.decode('utf-8')
-            # HACK: work around a bug where container entities got published to
-            # release_v03 topic
-            if self.elasticsearch_document_name == "release":
-                entity_dict = json.loads(json_str)
-                if entity_dict.get('name') and not entity_dict.get('title'):
-                    continue
-            entity = entity_from_json(json_str, self.entity_type, api_client=ac)
-            #print(entity)
-            elasticsearch_endpoint = "{}/{}/{}/{}".format(
+        while True:
+            batch = consumer.consume(
+                num_messages=self.batch_size,
+                timeout=self.poll_interval)
+            if not batch:
+                if not consumer.assignment():
+                    print("... no Kafka consumer partitions assigned yet")
+                print("... nothing new from kafka, try again (interval: {})".format(self.poll_interval))
+                continue
+            print("... got {} kafka messages".format(len(batch)))
+            # first check errors on entire batch...
+            for msg in batch:
+                if msg.error():
+                    raise KafkaException(msg.error())
+            # ... then process
+            bulk_actions = []
+            for msg in batch:
+                json_str = msg.value().decode('utf-8')
+                entity = entity_from_json(json_str, ReleaseEntity, api_client=ac)
+                print("Upserting: release/{}".format(entity.ident))
+                bulk_actions.append(json.dumps({
+                    "index": { "_id": entity.ident, },
+                }))
+                bulk_actions.append(json.dumps(
+                    release_to_elasticsearch(entity)))
+            elasticsearch_endpoint = "{}/{}/release/_bulk".format(
                 self.elasticsearch_backend,
-                self.elasticsearch_index,
-                self.elasticsearch_document_name,
-                entity.ident)
-            print("Updating document: {}".format(elasticsearch_endpoint))
-            resp = requests.post(elasticsearch_endpoint, json=self.transform_func(entity))
+                self.elasticsearch_index)
+            resp = requests.post(elasticsearch_endpoint,
+                headers={"Content-Type": "application/x-ndjson"},
+                data="\n".join(bulk_actions) + "\n")
             resp.raise_for_status()
-            #consumer.commit_offsets()
+            if resp.json()['errors']:
+                desc = "Elasticsearch errors from post to {}:".format(elasticsearch_endpoint)
+                print(desc)
+                print(resp.content)
+                raise Exception(desc)
+            consumer.store_offsets(batch[-1])
+

 class ElasticsearchContainerWorker(ElasticsearchReleaseWorker):

     def __init__(self, kafka_hosts, consume_topic, poll_interval=10.0, offset=None,
-            elasticsearch_backend="http://localhost:9200", elasticsearch_index="fatcat"):
+            elasticsearch_backend="http://localhost:9200", elasticsearch_index="fatcat",
+            batch_size=200):
         super().__init__(kafka_hosts=kafka_hosts,
                          consume_topic=consume_topic,
                          poll_interval=poll_interval,
                          offset=offset,
                          elasticsearch_backend=elasticsearch_backend,
-                         elasticsearch_index=elasticsearch_index)
+                         elasticsearch_index=elasticsearch_index,
+                         batch_size=batch_size)
         # previous group got corrupted (by pykafka library?)
         self.consumer_group = "elasticsearch-updates2"
         self.entity_type = ContainerEntity
         self.elasticsearch_document_name = "container"
         self.transform_func = container_to_elasticsearch
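
Instead of one POST per entity, the new run() loop goes through Elasticsearch's bulk endpoint: each document contributes two newline-delimited JSON lines, an action line carrying the _id followed by the document source. A standalone sketch of the request body the loop assembles, with made-up idents and trimmed documents (the backend URL and index name are the constructor defaults):

# Sketch of the NDJSON bulk request the worker builds; the idents and
# document fields below are made-up illustrative values.
import json
import requests

docs = [
    ("aaaaaaaaaaaaarceaaaaaaaaae", {"ident": "aaaaaaaaaaaaarceaaaaaaaaae", "title": "First"}),
    ("aaaaaaaaaaaaarceaaaaaaaaai", {"ident": "aaaaaaaaaaaaarceaaaaaaaaai", "title": "Second"}),
]
bulk_actions = []
for ident, doc in docs:
    bulk_actions.append(json.dumps({"index": {"_id": ident}}))  # action line
    bulk_actions.append(json.dumps(doc))                        # source line

resp = requests.post(
    "http://localhost:9200/fatcat/release/_bulk",
    headers={"Content-Type": "application/x-ndjson"},
    data="\n".join(bulk_actions) + "\n",  # bulk bodies must end with a newline
)
resp.raise_for_status()
# the bulk API returns HTTP 200 even when individual actions failed
assert not resp.json()["errors"]

Per-action failures do not raise an HTTP error, which is why the worker inspects the errors flag in the response body and only stores offsets afterwards.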
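Putting it together, a hypothetical invocation of the converted worker; the broker address and topic name are assumptions, not values from this commit:

# Hypothetical usage; kafka_hosts and consume_topic values are placeholders.
from fatcat_tools.workers.elasticsearch import ElasticsearchReleaseWorker

worker = ElasticsearchReleaseWorker(
    kafka_hosts="localhost:9092",
    consume_topic="release-updates",  # placeholder topic
    elasticsearch_backend="http://localhost:9200",
    elasticsearch_index="fatcat",
    batch_size=200,
)
worker.run()  # blocks forever, consuming and bulk-indexing

Because enable.auto.offset.store is disabled, consumer.store_offsets(batch[-1]) only records progress after the bulk post succeeds, so a crash mid-batch re-delivers the whole batch on restart (at-least-once delivery) rather than silently skipping it.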