aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers/common.py
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2019-04-08 15:42:55 -0700
committerBryan Newbold <bnewbold@robocracy.org>2019-04-08 15:47:50 -0700
commitc6db48b47ac53d9ca05d2ce33f97fb73fb85131d (patch)
treec9738e64169fd889f892a35bb4b478de17201e8a /python/fatcat_tools/importers/common.py
parentd43ee5f73e1f7d7f6317d1e7dfb1ddd0e510a353 (diff)
downloadfatcat-c6db48b47ac53d9ca05d2ce33f97fb73fb85131d.tar.gz
fatcat-c6db48b47ac53d9ca05d2ce33f97fb73fb85131d.zip
convert importers to confluent-kafka library
Diffstat (limited to 'python/fatcat_tools/importers/common.py')
-rw-r--r--python/fatcat_tools/importers/common.py91
1 files changed, 72 insertions, 19 deletions
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index b89c3828..4648f6f3 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -7,7 +7,7 @@ import ftfy
import itertools
import subprocess
from collections import Counter
-import pykafka
+from confluent_kafka import Consumer, KafkaException
import fatcat_client
from fatcat_client.rest import ApiException
@@ -429,35 +429,88 @@ class KafkaJsonPusher(RecordPusher):
topic_suffix,
group,
)
+ self.edit_batch_size = kwargs.get('edit_batch_size', 100)
def run(self):
count = 0
- for msg in self.consumer:
- if not msg:
+ while True:
+ batch = self.consumer.consume(
+ num_messages=self.edit_batch_size,
+ timeout=3.0)
+ print("... got {} kafka messages".format(len(batch)))
+ if not batch:
+ # TODO: could have some larger timeout here and
+ # self.importer.finish() if it's been more than, eg, a couple
+ # minutes
continue
- record = json.loads(msg.value.decode('utf-8'))
- self.importer.push_record(record)
- count += 1
- if count % 500 == 0:
- print("Import counts: {}".format(self.importer.counts))
+ # first check errors on entire batch...
+ for msg in batch:
+ if msg.error():
+ raise KafkaException(msg.error())
+ # ... then process
+ for msg in batch:
+ record = json.loads(msg.value().decode('utf-8'))
+ self.importer.push_record(record)
+ count += 1
+ if count % 500 == 0:
+ print("Import counts: {}".format(self.importer.counts))
+ # locally store the last processed message; will be auto-commited
+ # from this "stored" value
+ assert msg
+ self.consumer.store_offsets(msg)
# TODO: should catch UNIX signals (HUP?) to shutdown cleanly, and/or
# commit the current batch if it has been lingering
counts = self.importer.finish()
print(counts)
+ self.consumer.close()
return counts
def make_kafka_consumer(hosts, env, topic_suffix, group):
- topic_name = "fatcat-{}.{}".format(env, topic_suffix).encode('utf-8')
- client = pykafka.KafkaClient(hosts=hosts, broker_version="1.0.0")
- consume_topic = client.topics[topic_name]
- print("Consuming from kafka topic {}, group {}".format(topic_name, group))
-
- consumer = consume_topic.get_balanced_consumer(
- consumer_group=group.encode('utf-8'),
- managed=True,
- auto_commit_enable=True,
- auto_commit_interval_ms=30000, # 30 seconds
- compacted_topic=True,
+ topic_name = "fatcat-{}.{}".format(env, topic_suffix)
+
+ def fail_fast(err, partitions):
+ if err is not None:
+ print("Kafka consumer commit error: {}".format(err))
+ print("Bailing out...")
+ # TODO: should it be sys.exit(-1)?
+ raise KafkaException(err)
+ for p in partitions:
+ # check for partition-specific commit errors
+ print(p)
+ if p.error:
+ print("Kafka consumer commit error: {}".format(p.error))
+ print("Bailing out...")
+ # TODO: should it be sys.exit(-1)?
+ raise KafkaException(err)
+ #print("Kafka consumer commit successful")
+ pass
+
+ # previously, using pykafka
+ #auto_commit_enable=True,
+ #auto_commit_interval_ms=30000, # 30 seconds
+ conf = {
+ 'bootstrap.servers': hosts,
+ 'group.id': group.encode('utf-8'),
+ 'on_commit': fail_fast,
+ 'delivery.report.only.error': True,
+ 'enable.auto.offset.store': False,
+ 'default.topic.config': {
+ 'request.required.acks': -1,
+ 'auto.offset.reset': 'latest',
+ },
+ }
+
+ def on_rebalance(consumer, partitions):
+ print("Kafka partitions rebalanced: {} / {}".format(
+ consumer, partitions))
+
+ consumer = Consumer(conf)
+ # NOTE: it's actually important that topic_name *not* be bytes (UTF-8
+ # encoded)
+ consumer.subscribe([topic_name],
+ on_assign=on_rebalance,
+ on_revoke=on_rebalance,
)
+ print("Consuming from kafka topic {}, group {}".format(topic_name, group))
return consumer