From 881b46e3b1682974f48fc196f483c3fa2648b998 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Sun, 4 Nov 2018 17:26:28 -0800 Subject: first-draft kafka workers (changelog, release_update) --- python/fatcat/changelog_workers.py | 120 +++++++++++++++++++++++++++++++++++++ python/fatcat/worker_common.py | 25 ++++++++ python/fatcat_worker.py | 52 ++++++++++++++++ 3 files changed, 197 insertions(+) create mode 100644 python/fatcat/changelog_workers.py create mode 100644 python/fatcat/worker_common.py create mode 100755 python/fatcat_worker.py diff --git a/python/fatcat/changelog_workers.py b/python/fatcat/changelog_workers.py new file mode 100644 index 00000000..5f8621cf --- /dev/null +++ b/python/fatcat/changelog_workers.py @@ -0,0 +1,120 @@ + +import json +import time +from itertools import islice +from fatcat.worker_common import FatcatWorker +from pykafka.common import OffsetType + + +class FatcatChangelogWorker(FatcatWorker): + """ + Periodically polls the fatcat API looking for new changelogs. When they are + found, fetch them and push (as JSON) into a Kafka topic. + """ + + def __init__(self, api_host_url, kafka_hosts, produce_topic, poll_interval=10.0, offset=None): + # TODO: should be offset=0 + super().__init__(kafka_hosts=kafka_hosts, + produce_topic=produce_topic, + api_host_url=api_host_url) + self.poll_interval = poll_interval + self.offset = offset # the fatcat changelog offset, not the kafka offset + + def most_recent_message(self, topic): + """ + Tries to fetch the most recent message from a given topic. + This only makes sense for single partition topics, though could be + extended with "last N" behavior. + + Following "Consuming the last N messages from a topic" + from https://pykafka.readthedocs.io/en/latest/usage.html#consumer-patterns + """ + consumer = topic.get_simple_consumer( + auto_offset_reset=OffsetType.LATEST, + reset_offset_on_start=True) + offsets = [(p, op.last_offset_consumed - 1) + for p, op in consumer._partitions.items()] + offsets = [(p, (o if o > -1 else -2)) for p, o in offsets] + if -2 in [o for p, o in offsets]: + return None + else: + consumer.reset_offsets(offsets) + msg = islice(consumer, 1) + if msg: + return list(msg)[0].value + else: + return None + + def run(self): + topic = self.kafka.topics[self.produce_topic] + # On start, try to consume the most recent from the topic, and using + # that as the starting offset. Note that this is a single-partition + # topic + if self.offset is None: + print("Checking for most recent changelog offset...") + msg = self.most_recent_message(topic) + if msg: + self.offset = json.loads(msg.decode('utf-8'))['index'] + else: + self.offset = 1 + + with topic.get_sync_producer() as producer: + while True: + latest = int(self.api.get_changelog(limit=1)[0].index) + if latest > self.offset: + print("Fetching changelogs from {} through {}".format( + self.offset+1, latest)) + for i in range(self.offset+1, latest+1): + cle = self.api.get_changelog_entry(i) + obj = self.api.api_client.sanitize_for_serialization(cle) + producer.produce( + message=json.dumps(obj).encode('utf-8'), + partition_key=None, + timestamp=None, + #XXX: timestamp=cle.timestamp, + ) + self.offset = i + print("Sleeping {} seconds...".format(self.poll_interval)) + time.sleep(self.poll_interval) + + +class FatcatEntityUpdatesWorker(FatcatWorker): + """ + Consumes from the changelog topic and publishes expanded entities (fetched + from API) to update topics. + + For now, only release updates are published. + """ + + def __init__(self, api_host_url, kafka_hosts, consume_topic, release_topic): + super().__init__(kafka_hosts=kafka_hosts, + consume_topic=consume_topic, + api_host_url=api_host_url) + self.release_topic = release_topic + self.consumer_group = "entity-updates" + + def run(self): + changelog_topic = self.kafka.topics[self.consume_topic] + release_topic = self.kafka.topics[self.release_topic] + + consumer = changelog_topic.get_simple_consumer( + consumer_group=self.consumer_group, + auto_offset_reset=OffsetType.LATEST, + reset_offset_on_start=False, + ) + + with release_topic.get_sync_producer() as producer: + for msg in consumer: + cle = json.loads(msg.value.decode('utf-8')) + #print(cle) + release_edits = cle['editgroup']['edits']['releases'] + for re in release_edits: + ident = re['ident'] + release = self.api.get_release(ident, expand="files,container") + release_dict = self.api.api_client.sanitize_for_serialization(release) + producer.produce( + message=json.dumps(release_dict).encode('utf-8'), + partition_key=ident.encode('utf-8'), + timestamp=None, + ) + diff --git a/python/fatcat/worker_common.py b/python/fatcat/worker_common.py new file mode 100644 index 00000000..77ea2c15 --- /dev/null +++ b/python/fatcat/worker_common.py @@ -0,0 +1,25 @@ + +import re +import sys +import csv +import json +import itertools +import fatcat_client +from pykafka import KafkaClient +from fatcat_client.rest import ApiException + + +class FatcatWorker: + """ + Common code for for Kafka producers and consumers. + """ + + def __init__(self, kafka_hosts, produce_topic=None, consume_topic=None, api_host_url=None): + if api_host_url: + conf = fatcat_client.Configuration() + conf.host = api_host_url + self.api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf)) + self.kafka = KafkaClient(hosts=kafka_hosts, broker_version="1.0.0") + self.produce_topic = produce_topic + self.consume_topic = consume_topic + diff --git a/python/fatcat_worker.py b/python/fatcat_worker.py new file mode 100755 index 00000000..cc11beca --- /dev/null +++ b/python/fatcat_worker.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python3 + +import sys +import argparse +from fatcat.changelog_workers import FatcatChangelogWorker, FatcatEntityUpdatesWorker + +def run_changelog_worker(args): + topic = "fatcat-{}.changelog".format(args.env) + worker = FatcatChangelogWorker(args.api_host_url, args.kafka_hosts, topic, + args.poll_interval) + worker.run() + +def run_entity_updates_worker(args): + changelog_topic = "fatcat-{}.changelog".format(args.env) + release_topic = "fatcat-{}.release-updates".format(args.env) + worker = FatcatEntityUpdatesWorker(args.api_host_url, args.kafka_hosts, + changelog_topic, release_topic) + worker.run() + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--debug', + action='store_true', + help="enable debug logging") + parser.add_argument('--api-host-url', + default="http://localhost:9411/v0", + help="fatcat API host/port to use") + parser.add_argument('--kafka-hosts', + default="localhost:9092", + help="list of Kafka brokers (host/port) to use") + parser.add_argument('--env', + default="qa", + help="Kafka topic namespace to use (eg, prod, qa)") + subparsers = parser.add_subparsers() + + sub_changelog = subparsers.add_parser('changelog') + sub_changelog.set_defaults(func=run_changelog_worker) + sub_changelog.add_argument('--poll-interval', + help="how long to wait between polling (seconds)", + default=10.0, type=float) + + sub_entity_updates = subparsers.add_parser('entity-updates') + sub_entity_updates.set_defaults(func=run_entity_updates_worker) + + args = parser.parse_args() + if not args.__dict__.get("func"): + print("tell me what to do!") + sys.exit(-1) + args.func(args) + +if __name__ == '__main__': + main() -- cgit v1.2.3