about | summary | refs | log | tree | commit | diff | stats
path: root/python/fatcat_tools/workers
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat_tools/workers')
-rw-r--r-- python/fatcat_tools/workers/changelog.py 16
-rw-r--r-- python/fatcat_tools/workers/elasticsearch.py 22
2 files changed, 26 insertions, 12 deletions
diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py
index c134bde2..8b1ba5e9 100644
--- a/python/fatcat_tools/workers/changelog.py
+++ b/python/fatcat_tools/workers/changelog.py
@@ -40,7 +40,14 @@ class ChangelogWorker(FatcatWorker):
# TODO: should it be sys.exit(-1)?
raise KafkaException(err)
- producer = Producer(self.kafka_config)
+ producer_conf = self.kafka_config.copy()
+ producer_conf.update({
+ 'delivery.report.only.error': True,
+ 'default.topic.config': {
+ 'request.required.acks': -1, # all brokers must confirm
+ },
+ })
+ producer = Producer(producer_conf)
while True:
latest = int(self.api.get_changelog(limit=1)[0].index)
@@ -58,7 +65,7 @@ class ChangelogWorker(FatcatWorker):
#NOTE timestamp could be timestamp=cle.timestamp (?)
)
self.offset = i
- producer.poll(0)
+ producer.flush()
print("Sleeping {} seconds...".format(self.poll_interval))
time.sleep(self.poll_interval)
@@ -118,7 +125,6 @@ class EntityUpdatesWorker(FatcatWorker):
consumer_conf.update({
'group.id': self.consumer_group,
'on_commit': fail_fast,
- 'delivery.report.only.error': True,
# messages don't have offset marked as stored until pushed to
# elastic, but we do auto-commit stored offsets to broker
'enable.auto.commit': True,
@@ -134,8 +140,9 @@ class EntityUpdatesWorker(FatcatWorker):
producer_conf = self.kafka_config.copy()
producer_conf.update({
+ 'delivery.report.only.error': True,
'default.topic.config': {
- 'request.required.acks': -1,
+ 'request.required.acks': -1, # all brokers must confirm
},
})
producer = Producer(producer_conf)
@@ -207,6 +214,7 @@ class EntityUpdatesWorker(FatcatWorker):
key=ident.encode('utf-8'),
on_delivery=fail_fast,
)
+ producer.flush()
# TODO: actually update works
consumer.store_offsets(message=msg)
diff --git a/python/fatcat_tools/workers/elasticsearch.py b/python/fatcat_tools/workers/elasticsearch.py
index acb705c2..2ba241eb 100644
--- a/python/fatcat_tools/workers/elasticsearch.py
+++ b/python/fatcat_tools/workers/elasticsearch.py
@@ -2,7 +2,7 @@
import json
import time
import requests
-from confluent_kafka import Consumer, Producer, KafkaException
+from confluent_kafka import Consumer, KafkaException
from fatcat_openapi_client import ReleaseEntity, ContainerEntity, ApiClient
from fatcat_tools import *
@@ -61,7 +61,6 @@ class ElasticsearchReleaseWorker(FatcatWorker):
consumer_conf.update({
'group.id': self.consumer_group,
'on_commit': fail_fast,
- 'delivery.report.only.error': True,
# messages don't have offset marked as stored until pushed to
# elastic, but we do auto-commit stored offsets to broker
'enable.auto.commit': True,
@@ -97,16 +96,24 @@ class ElasticsearchReleaseWorker(FatcatWorker):
bulk_actions = []
for msg in batch:
json_str = msg.value().decode('utf-8')
- entity = entity_from_json(json_str, ReleaseEntity, api_client=ac)
- print("Upserting: release/{}".format(entity.ident))
+ # HACK: work around a bug where container entities got published to
+ # release_v03 topic
+ if self.elasticsearch_document_name == "release":
+ entity_dict = json.loads(json_str)
+ if entity_dict.get('name') and not entity_dict.get('title'):
+ continue
+ entity = entity_from_json(json_str, self.entity_type, api_client=ac)
+ # TODO: handle deletions from index
bulk_actions.append(json.dumps({
"index": { "_id": entity.ident, },
}))
bulk_actions.append(json.dumps(
- release_to_elasticsearch(entity)))
- elasticsearch_endpoint = "{}/{}/release/_bulk".format(
+ self.transform_func(entity)))
+ print("Upserting, eg, {} (of {} releases in elasticsearch)".format(entity.ident, len(batch)))
+ elasticsearch_endpoint = "{}/{}/{}/_bulk".format(
self.elasticsearch_backend,
- self.elasticsearch_index)
+ self.elasticsearch_index,
+ self.elasticsearch_document_name)
resp = requests.post(elasticsearch_endpoint,
headers={"Content-Type": "application/x-ndjson"},
data="\n".join(bulk_actions) + "\n")
@@ -141,4 +148,3 @@ class ElasticsearchContainerWorker(ElasticsearchReleaseWorker):
self.elasticsearch_document_name = "container"
self.transform_func = container_to_elasticsearch
-