small fixes to confluent-kafka importers/workers

- decrease default changelog pipeline to 5.0sec
- fix missing KafkaException harvester imports
- more confluent-kafka tweaks
- updates to kafka consumer configs
- bump elastic updates consumergroup (again)
author: Bryan Newbold <bnewbold@robocracy.org> 2019-09-19 17:11:47 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2019-09-20 11:21:11 -0700
commit: 80dc9bab9c6e40cdde95f9e9c7fad13ca64b0769 (patch)
tree: 61e14bd3a21bd3dbf70a8a54c488cf09c7c2f11f /python/fatcat_tools/importers
parent: 6183e95e9739a6fbf0d8cd77603d075e87804abb (diff)
download: fatcat-80dc9bab9c6e40cdde95f9e9c7fad13ca64b0769.tar.gz
fatcat-80dc9bab9c6e40cdde95f9e9c7fad13ca64b0769.zip
1 files changed, 24 insertions, 10 deletions
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 32bb210a..42fe38aa 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -703,14 +703,23 @@ class KafkaJsonPusher(RecordPusher):
             topic_suffix,
             group,
         )
+        self.poll_interval = kwargs.get('poll_interval', 5.0)
+        self.consume_batch_size = kwargs.get('consume_batch_size', 100)
 
     def run(self):
         count = 0
         while True:
+            # TODO: this is batch-oriented, because underlying importer is
+            # often batch-oriented, but this doesn't confirm that entire batch
+            # has been pushed to fatcat before committing offset. Eg, consider
+            # case where there is one update and thousands of creates;
+            # update would be lingering in importer, and if importer crashed
+            # it would never be created. Not great.
             batch = self.consumer.consume(
-                num_messages=self.edit_batch_size,
-                timeout=3.0)
-            print("... got {} kafka messages".format(len(batch)))
+                num_messages=self.consume_batch_size,
+                timeout=self.poll_interval)
+            print("... got {} kafka messages ({}sec poll interval)".format(
+                len(batch), self.poll_interval))
             if not batch:
                 # TODO: could have some larger timeout here and
                 # self.importer.finish() if it's been more than, eg, a couple
@@ -727,10 +736,11 @@ class KafkaJsonPusher(RecordPusher):
                 count += 1
                 if count % 500 == 0:
                     print("Import counts: {}".format(self.importer.counts))
-            # locally store the last processed message; will be auto-commited
-            # from this "stored" value
-            assert msg
-            self.consumer.store_offsets(msg)
+            for msg in batch:
+                # locally store offsets of processed messages; will be
+                # auto-committed by librdkafka from this "stored" value
+                self.consumer.store_offsets(message=msg)
+
         # TODO: should catch UNIX signals (HUP?) to shutdown cleanly, and/or
         # commit the current batch if it has been lingering
         counts = self.importer.finish()
@@ -750,7 +760,6 @@ def make_kafka_consumer(hosts, env, topic_suffix, group):
             raise KafkaException(err)
         for p in partitions:
             # check for partition-specific commit errors
-            print(p)
             if p.error:
                 print("Kafka consumer commit error: {}".format(p.error))
                 print("Bailing out...")
@@ -764,12 +773,17 @@ def make_kafka_consumer(hosts, env, topic_suffix, group):
     #auto_commit_interval_ms=30000, # 30 seconds
     conf = {
         'bootstrap.servers': hosts,
-        'group.id': group.encode('utf-8'),
+        'group.id': group,
         'on_commit': fail_fast,
         'delivery.report.only.error': True,
+        # messages don't have offset marked as stored until pushed to the
+        # importer, but we do auto-commit stored offsets to broker
         'enable.auto.offset.store': False,
+        'enable.auto.commit': True,
+        # user code timeout; if no poll after this long, assume user code
+        # hung and rebalance (default: 5min)
+        'max.poll.interval.ms': 120000,
         'default.topic.config': {
-            'request.required.acks': -1,
             'auto.offset.reset': 'latest',
         },
     }
author	Bryan Newbold <bnewbold@robocracy.org>	2019-09-19 17:11:47 -0700
committer	Bryan Newbold <bnewbold@robocracy.org>	2019-09-20 11:21:11 -0700
commit	80dc9bab9c6e40cdde95f9e9c7fad13ca64b0769 (patch)
tree	61e14bd3a21bd3dbf70a8a54c488cf09c7c2f11f /python/fatcat_tools/importers
parent	6183e95e9739a6fbf0d8cd77603d075e87804abb (diff)
download	fatcat-80dc9bab9c6e40cdde95f9e9c7fad13ca64b0769.tar.gz fatcat-80dc9bab9c6e40cdde95f9e9c7fad13ca64b0769.zip