aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/workers/elasticsearch.py
blob: 91224d98e91a0c5bea17d31f580f658a80a831ac (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80

import json
import time
import requests
from pykafka.common import OffsetType

from fatcat_openapi_client import ReleaseEntity, ContainerEntity, ApiClient
from fatcat_tools import *
from .worker_common import FatcatWorker


class ElasticsearchReleaseWorker(FatcatWorker):
    """
    Consumes from release-updates topic and pushes into (presumably local)
    elasticsearch.

    Uses a consumer group to manage offset.

    Each consumed message is decoded as UTF-8 JSON, parsed into an
    ``entity_type`` instance, converted with ``transform_func`` and POSTed to
    ``<backend>/<index>/<document_name>/<entity.ident>``. Subclasses retarget
    the worker by overriding ``consumer_group``, ``entity_type``,
    ``elasticsearch_document_name`` and ``transform_func``.
    """

    # Seconds to wait on elasticsearch for a single index request. `requests`
    # never times out by default, so a hung connection would otherwise stall
    # the consumer loop forever without raising. Override on a subclass or an
    # instance to tune it.
    elasticsearch_timeout = 30.0

    def __init__(self, kafka_hosts, consume_topic, poll_interval=10.0, offset=None,
            elasticsearch_backend="http://localhost:9200", elasticsearch_index="fatcat"):
        """
        Args:
            kafka_hosts: passed through to ``FatcatWorker.__init__``.
            consume_topic: name of the Kafka topic to read entity updates
                from; passed through to ``FatcatWorker.__init__``.
            poll_interval: accepted for signature compatibility; not used by
                this class.
            offset: accepted for signature compatibility; not used by this
                class.
            elasticsearch_backend: base URL of the elasticsearch HTTP API.
            elasticsearch_index: name of the index documents are written to.
        """
        super().__init__(kafka_hosts=kafka_hosts,
                         consume_topic=consume_topic)
        self.consumer_group = "elasticsearch-updates"
        self.elasticsearch_backend = elasticsearch_backend
        self.elasticsearch_index = elasticsearch_index
        self.entity_type = ReleaseEntity
        self.elasticsearch_document_name = "release"
        self.transform_func = release_to_elasticsearch

    def run(self):
        """
        Consume messages from the topic and index each entity, one HTTP
        request per message. Loops for as long as the consumer yields
        messages.

        Raises:
            requests.HTTPError: elasticsearch answered with an error status.
            requests.Timeout: elasticsearch did not answer within
                ``elasticsearch_timeout`` seconds.
        """
        # NOTE(review): self.kafka is not set in this class; presumably the
        # FatcatWorker base class creates the client -- confirm.
        consume_topic = self.kafka.topics[self.consume_topic]
        ac = ApiClient()

        consumer = consume_topic.get_balanced_consumer(
            consumer_group=self.consumer_group,
            managed=True,
            fetch_message_max_bytes=10000000, # up to ~10 MBytes
            auto_commit_enable=True,
            auto_commit_interval_ms=30000, # 30 seconds
            compacted_topic=True,
        )

        for msg in consumer:
            json_str = msg.value.decode('utf-8')
            # HACK: work around a bug where container entities got published to
            # release_v03 topic
            if self.elasticsearch_document_name == "release":
                entity_dict = json.loads(json_str)
                # A record with a 'name' but no 'title' is treated as a
                # misrouted container and skipped.
                if entity_dict.get('name') and not entity_dict.get('title'):
                    continue
            entity = entity_from_json(json_str, self.entity_type, api_client=ac)
            elasticsearch_endpoint = "{}/{}/{}/{}".format(
                self.elasticsearch_backend,
                self.elasticsearch_index,
                self.elasticsearch_document_name,
                entity.ident)
            print("Updating document: {}".format(elasticsearch_endpoint))
            resp = requests.post(
                elasticsearch_endpoint,
                json=self.transform_func(entity),
                timeout=self.elasticsearch_timeout,
            )
            # Fail fast: any error status aborts the worker instead of
            # silently dropping the update.
            resp.raise_for_status()
            # Offsets are not committed per message here; the consumer is
            # configured above to auto-commit on an interval.


class ElasticsearchContainerWorker(ElasticsearchReleaseWorker):
    """
    Variant of the release worker that indexes container entities.

    The consume/index loop is inherited unchanged; only the consumer group,
    entity type, document name and transform function differ.
    """

    def __init__(self, kafka_hosts, consume_topic, poll_interval=10.0, offset=None,
            elasticsearch_backend="http://localhost:9200", elasticsearch_index="fatcat"):
        """
        Forward every argument to the release worker's constructor, then
        swap in the container-specific settings.
        """
        parent_kwargs = {
            "kafka_hosts": kafka_hosts,
            "consume_topic": consume_topic,
            "poll_interval": poll_interval,
            "offset": offset,
            "elasticsearch_backend": elasticsearch_backend,
            "elasticsearch_index": elasticsearch_index,
        }
        super().__init__(**parent_kwargs)

        # Container-specific overrides of what the parent just configured.
        self.entity_type = ContainerEntity
        self.transform_func = container_to_elasticsearch
        self.elasticsearch_document_name = "container"
        # The earlier consumer group got corrupted (by the pykafka
        # library?), so this worker consumes under a fresh group name.
        self.consumer_group = "elasticsearch-updates2"