aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2018-11-04 17:26:28 -0800
committerBryan Newbold <bnewbold@robocracy.org>2018-11-04 17:26:28 -0800
commit881b46e3b1682974f48fc196f483c3fa2648b998 (patch)
treeba73e463482b8a125d92ef1942cbaa1f19f535f6
parent87b8f1c710c7c94ed9c1b040f75c1601591a214b (diff)
downloadfatcat-881b46e3b1682974f48fc196f483c3fa2648b998.tar.gz
fatcat-881b46e3b1682974f48fc196f483c3fa2648b998.zip
first-draft kafka workers (changelog, release_update)
-rw-r--r--python/fatcat/changelog_workers.py120
-rw-r--r--python/fatcat/worker_common.py25
-rwxr-xr-xpython/fatcat_worker.py52
3 files changed, 197 insertions, 0 deletions
diff --git a/python/fatcat/changelog_workers.py b/python/fatcat/changelog_workers.py
new file mode 100644
index 00000000..5f8621cf
--- /dev/null
+++ b/python/fatcat/changelog_workers.py
@@ -0,0 +1,120 @@
+
+import json
+import time
+from itertools import islice
+from fatcat.worker_common import FatcatWorker
+from pykafka.common import OffsetType
+
+
+class FatcatChangelogWorker(FatcatWorker):
+ """
+ Periodically polls the fatcat API looking for new changelogs. When they are
+ found, fetch them and push (as JSON) into a Kafka topic.
+ """
+
+ def __init__(self, api_host_url, kafka_hosts, produce_topic, poll_interval=10.0, offset=None):
+ # TODO: should be offset=0
+ super().__init__(kafka_hosts=kafka_hosts,
+ produce_topic=produce_topic,
+ api_host_url=api_host_url)
+ self.poll_interval = poll_interval
+ self.offset = offset # the fatcat changelog offset, not the kafka offset
+
+ def most_recent_message(self, topic):
+ """
+ Tries to fetch the most recent message from a given topic.
+ This only makes sense for single partition topics, though could be
+ extended with "last N" behavior.
+
+ Following "Consuming the last N messages from a topic"
+ from https://pykafka.readthedocs.io/en/latest/usage.html#consumer-patterns
+ """
+ consumer = topic.get_simple_consumer(
+ auto_offset_reset=OffsetType.LATEST,
+ reset_offset_on_start=True)
+ offsets = [(p, op.last_offset_consumed - 1)
+ for p, op in consumer._partitions.items()]
+ offsets = [(p, (o if o > -1 else -2)) for p, o in offsets]
+ if -2 in [o for p, o in offsets]:
+ return None
+ else:
+ consumer.reset_offsets(offsets)
+ msg = islice(consumer, 1)
+ if msg:
+ return list(msg)[0].value
+ else:
+ return None
+
+ def run(self):
+ topic = self.kafka.topics[self.produce_topic]
+ # On start, try to consume the most recent from the topic, and using
+ # that as the starting offset. Note that this is a single-partition
+ # topic
+ if self.offset is None:
+ print("Checking for most recent changelog offset...")
+ msg = self.most_recent_message(topic)
+ if msg:
+ self.offset = json.loads(msg.decode('utf-8'))['index']
+ else:
+ self.offset = 1
+
+ with topic.get_sync_producer() as producer:
+ while True:
+ latest = int(self.api.get_changelog(limit=1)[0].index)
+ if latest > self.offset:
+ print("Fetching changelogs from {} through {}".format(
+ self.offset+1, latest))
+ for i in range(self.offset+1, latest+1):
+ cle = self.api.get_changelog_entry(i)
+ obj = self.api.api_client.sanitize_for_serialization(cle)
+ producer.produce(
+ message=json.dumps(obj).encode('utf-8'),
+ partition_key=None,
+ timestamp=None,
+ #XXX: timestamp=cle.timestamp,
+ )
+ self.offset = i
+ print("Sleeping {} seconds...".format(self.poll_interval))
+ time.sleep(self.poll_interval)
+
+
+class FatcatEntityUpdatesWorker(FatcatWorker):
+ """
+ Consumes from the changelog topic and publishes expanded entities (fetched
+ from API) to update topics.
+
+ For now, only release updates are published.
+ """
+
+ def __init__(self, api_host_url, kafka_hosts, consume_topic, release_topic):
+ super().__init__(kafka_hosts=kafka_hosts,
+ consume_topic=consume_topic,
+ api_host_url=api_host_url)
+ self.release_topic = release_topic
+ self.consumer_group = "entity-updates"
+
+ def run(self):
+ changelog_topic = self.kafka.topics[self.consume_topic]
+ release_topic = self.kafka.topics[self.release_topic]
+
+ consumer = changelog_topic.get_simple_consumer(
+ consumer_group=self.consumer_group,
+ auto_offset_reset=OffsetType.LATEST,
+ reset_offset_on_start=False,
+ )
+
+ with release_topic.get_sync_producer() as producer:
+ for msg in consumer:
+ cle = json.loads(msg.value.decode('utf-8'))
+ #print(cle)
+ release_edits = cle['editgroup']['edits']['releases']
+ for re in release_edits:
+ ident = re['ident']
+ release = self.api.get_release(ident, expand="files,container")
+ release_dict = self.api.api_client.sanitize_for_serialization(release)
+ producer.produce(
+ message=json.dumps(release_dict).encode('utf-8'),
+ partition_key=ident.encode('utf-8'),
+ timestamp=None,
+ )
+
diff --git a/python/fatcat/worker_common.py b/python/fatcat/worker_common.py
new file mode 100644
index 00000000..77ea2c15
--- /dev/null
+++ b/python/fatcat/worker_common.py
@@ -0,0 +1,25 @@
+
+import re
+import sys
+import csv
+import json
+import itertools
+import fatcat_client
+from pykafka import KafkaClient
+from fatcat_client.rest import ApiException
+
+
+class FatcatWorker:
+ """
+ Common code for for Kafka producers and consumers.
+ """
+
+ def __init__(self, kafka_hosts, produce_topic=None, consume_topic=None, api_host_url=None):
+ if api_host_url:
+ conf = fatcat_client.Configuration()
+ conf.host = api_host_url
+ self.api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf))
+ self.kafka = KafkaClient(hosts=kafka_hosts, broker_version="1.0.0")
+ self.produce_topic = produce_topic
+ self.consume_topic = consume_topic
+
diff --git a/python/fatcat_worker.py b/python/fatcat_worker.py
new file mode 100755
index 00000000..cc11beca
--- /dev/null
+++ b/python/fatcat_worker.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+
+import sys
+import argparse
+from fatcat.changelog_workers import FatcatChangelogWorker, FatcatEntityUpdatesWorker
+
+def run_changelog_worker(args):
+ topic = "fatcat-{}.changelog".format(args.env)
+ worker = FatcatChangelogWorker(args.api_host_url, args.kafka_hosts, topic,
+ args.poll_interval)
+ worker.run()
+
+def run_entity_updates_worker(args):
+ changelog_topic = "fatcat-{}.changelog".format(args.env)
+ release_topic = "fatcat-{}.release-updates".format(args.env)
+ worker = FatcatEntityUpdatesWorker(args.api_host_url, args.kafka_hosts,
+ changelog_topic, release_topic)
+ worker.run()
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--debug',
+ action='store_true',
+ help="enable debug logging")
+ parser.add_argument('--api-host-url',
+ default="http://localhost:9411/v0",
+ help="fatcat API host/port to use")
+ parser.add_argument('--kafka-hosts',
+ default="localhost:9092",
+ help="list of Kafka brokers (host/port) to use")
+ parser.add_argument('--env',
+ default="qa",
+ help="Kafka topic namespace to use (eg, prod, qa)")
+ subparsers = parser.add_subparsers()
+
+ sub_changelog = subparsers.add_parser('changelog')
+ sub_changelog.set_defaults(func=run_changelog_worker)
+ sub_changelog.add_argument('--poll-interval',
+ help="how long to wait between polling (seconds)",
+ default=10.0, type=float)
+
+ sub_entity_updates = subparsers.add_parser('entity-updates')
+ sub_entity_updates.set_defaults(func=run_entity_updates_worker)
+
+ args = parser.parse_args()
+ if not args.__dict__.get("func"):
+ print("tell me what to do!")
+ sys.exit(-1)
+ args.func(args)
+
+if __name__ == '__main__':
+ main()