summary | refs | log | tree | commit | diff | stats
path: root/python/fatcat_tools/harvest
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2019-09-19 20:00:24 -0700
committer Bryan Newbold <bnewbold@robocracy.org> 2019-09-20 11:21:31 -0700
commit 90b5cb354d7d73c920288394aa9fd8d58e752157 (patch)
tree 204085442b967fde2b8ad7ad46f521f3e3b834eb /python/fatcat_tools/harvest
parent 80dc9bab9c6e40cdde95f9e9c7fad13ca64b0769 (diff)
download fatcat-90b5cb354d7d73c920288394aa9fd8d58e752157.tar.gz
fatcat-90b5cb354d7d73c920288394aa9fd8d58e752157.zip
review/fix all confluent-kafka produce code
Diffstat (limited to 'python/fatcat_tools/harvest')
-rw-r--r-- python/fatcat_tools/harvest/doi_registrars.py 13
-rw-r--r-- python/fatcat_tools/harvest/harvest_common.py 37
-rw-r--r-- python/fatcat_tools/harvest/oaipmh.py 13
3 files changed, 49 insertions, 14 deletions
diff --git a/python/fatcat_tools/harvest/doi_registrars.py b/python/fatcat_tools/harvest/doi_registrars.py
index 3362df35..7e791745 100644
--- a/python/fatcat_tools/harvest/doi_registrars.py
+++ b/python/fatcat_tools/harvest/doi_registrars.py
@@ -56,11 +56,7 @@ class HarvestCrossrefWorker:
self.is_update_filter = is_update_filter
self.kafka_config = {
'bootstrap.servers': kafka_hosts,
- 'delivery.report.only.error': True,
'message.max.bytes': 20000000, # ~20 MBytes; broker is ~50 MBytes
- 'default.topic.config': {
- 'request.required.acks': 'all',
- },
}
self.state = HarvestState(start_date, end_date)
@@ -97,7 +93,14 @@ class HarvestCrossrefWorker:
# TODO: should it be sys.exit(-1)?
raise KafkaException(err)
- producer = Producer(self.kafka_config)
+ producer_conf = self.kafka_config.copy()
+ producer_conf.update({
+ 'delivery.report.only.error': True,
+ 'default.topic.config': {
+ 'request.required.acks': -1, # all brokers must confirm
+ },
+ })
+ producer = Producer(producer_conf)
date_str = date.isoformat()
params = self.params(date_str)
diff --git a/python/fatcat_tools/harvest/harvest_common.py b/python/fatcat_tools/harvest/harvest_common.py
index aa7a69f5..78830a1c 100644
--- a/python/fatcat_tools/harvest/harvest_common.py
+++ b/python/fatcat_tools/harvest/harvest_common.py
@@ -130,30 +130,56 @@ class HarvestState:
if err:
raise KafkaException(err)
print("Commiting status to Kafka: {}".format(kafka_topic))
- producer = Producer(kafka_config)
- producer.produce(kafka_topic, state_json, on_delivery=fail_fast)
+ producer_conf = kafka_config.copy()
+ producer_conf.update({
+ 'delivery.report.only.error': True,
+ 'default.topic.config': {
+ 'request.required.acks': -1, # all brokers must confirm
+ },
+ })
+ producer = Producer(producer_conf)
+ producer.produce(
+ kafka_topic,
+ state_json,
+ on_delivery=fail_fast)
producer.flush()
return state_json
def initialize_from_kafka(self, kafka_topic, kafka_config):
"""
kafka_topic should have type str
+
+ TODO: this method does not fail if client can't connect to host.
"""
if not kafka_topic:
return
print("Fetching state from kafka topic: {}".format(kafka_topic))
+ def fail_fast(err, msg):
+ if err:
+ raise KafkaException(err)
conf = kafka_config.copy()
conf.update({
+ 'group.id': 'dummy_init_group', # should never be commited
+ 'enable.auto.commit': False,
'auto.offset.reset': 'earliest',
'session.timeout.ms': 10000,
- 'group.id': kafka_topic + "-init",
})
consumer = Consumer(conf)
+
+ # this watermark fetch is mostly to ensure we are connected to broker and
+ # fail fast if not, but we also confirm that we read to end below.
+ hwm = consumer.get_watermark_offsets(
+ TopicPartition(kafka_topic, 0),
+ timeout=5.0,
+ cached=False)
+ if not hwm:
+ raise Exception("Kafka consumer timeout, or topic {} doesn't exist".format(kafka_topic))
+
consumer.assign([TopicPartition(kafka_topic, 0, 0)])
c = 0
while True:
- msg = consumer.poll(timeout=1.0)
+ msg = consumer.poll(timeout=2.0)
if not msg:
break
if msg.error():
@@ -162,4 +188,7 @@ class HarvestState:
self.update(msg.value().decode('utf-8'))
c += 1
consumer.close()
+
+ # verify that we got at least to HWM
+ assert c >= hwm[1]
print("... got {} state update messages, done".format(c))
diff --git a/python/fatcat_tools/harvest/oaipmh.py b/python/fatcat_tools/harvest/oaipmh.py
index 3e3bea03..f908ba83 100644
--- a/python/fatcat_tools/harvest/oaipmh.py
+++ b/python/fatcat_tools/harvest/oaipmh.py
@@ -39,10 +39,7 @@ class HarvestOaiPmhWorker:
self.state_topic = state_topic
self.kafka_config = {
'bootstrap.servers': kafka_hosts,
- 'delivery.report.only.error': True,
'message.max.bytes': 20000000, # ~20 MBytes; broker is ~50 MBytes
- 'default.topic.config':
- {'request.required.acks': 'all'},
}
self.loop_sleep = 60*60 # how long to wait, in seconds, between date checks
@@ -62,7 +59,14 @@ class HarvestOaiPmhWorker:
# TODO: should it be sys.exit(-1)?
raise KafkaException(err)
- producer = Producer(self.kafka_config)
+ producer_conf = self.kafka_config.copy()
+ producer_conf.update({
+ 'delivery.report.only.error': True,
+ 'default.topic.config': {
+ 'request.required.acks': -1, # all brokers must confirm
+ },
+ })
+ producer = Producer(producer_conf)
api = sickle.Sickle(self.endpoint_url)
date_str = date.isoformat()
@@ -88,7 +92,6 @@ class HarvestOaiPmhWorker:
item.raw.encode('utf-8'),
key=item.header.identifier.encode('utf-8'),
on_delivery=fail_fast)
- producer.poll(0)
producer.flush()
def run(self, continuous=False):