summaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/workers/worker_common.py
blob: 9ffbe5fd2b31eed35859aa711d69853118f6deda (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57

import re
import sys
import csv
import json
import itertools
from itertools import islice
from pykafka import KafkaClient
from pykafka.common import OffsetType

import fatcat_openapi_client
from fatcat_openapi_client.rest import ApiException


def most_recent_message(topic):
    """
    Tries to fetch the most recent message from a given topic.
    This only makes sense for single partition topics, though could be
    extended with "last N" behavior.

    Following "Consuming the last N messages from a topic"
    from https://pykafka.readthedocs.io/en/latest/usage.html#consumer-patterns
    """
    consumer = topic.get_simple_consumer(
        auto_offset_reset=OffsetType.LATEST,
        reset_offset_on_start=True)
    offsets = [(p, op.last_offset_consumed - 1)
                for p, op in consumer._partitions.items()]
    offsets = [(p, (o if o > -1 else -2)) for p, o in offsets]
    if -2 in [o for p, o in offsets]:
        consumer.stop()
        return None
    else:
        consumer.reset_offsets(offsets)
        msg = islice(consumer, 1)
        if msg:
            val = list(msg)[0].value
            consumer.stop()
            return val
        else:
            consumer.stop()
            return None

class FatcatWorker:
    """
    Common code for for Kafka producers and consumers.
    """

    def __init__(self, kafka_hosts, produce_topic=None, consume_topic=None, api=None):
        if api:
            self.api = api
        self.kafka = KafkaClient(hosts=kafka_hosts, broker_version="1.0.0")
        self.produce_topic = produce_topic
        self.consume_topic = consume_topic

        # Kafka producer batch size tuning; also limit on size of single document
        self.produce_max_request_size = 10000000  # 10 MByte-ish