aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r--python/fatcat_tools/__init__.py1
-rw-r--r--python/fatcat_tools/importers/ingest.py7
-rw-r--r--python/fatcat_tools/kafka.py22
3 files changed, 28 insertions, 2 deletions
diff --git a/python/fatcat_tools/__init__.py b/python/fatcat_tools/__init__.py
index f2798f0b..78c5c90b 100644
--- a/python/fatcat_tools/__init__.py
+++ b/python/fatcat_tools/__init__.py
@@ -2,3 +2,4 @@
from .api_auth import authenticated_api, public_api
from .fcid import fcid2uuid, uuid2fcid
from .transforms import *
+from .kafka import simple_kafka_producer, kafka_fail_fast
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index 7dad13ce..deb4ef51 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -28,13 +28,16 @@ class IngestFileResultImporter(EntityImporter):
print("Requiring GROBID status == 200")
else:
print("NOT checking GROBID success")
- self.ingest_request_source_whitelist = ['fatcat-changelog']
+ self.ingest_request_source_whitelist = [
+ 'fatcat-changelog',
+ 'fatcat-ingest-container',
+ ]
if kwargs.get('skip_source_whitelist', False):
self.ingest_request_source_whitelist = []
def want(self, row):
"""
- Logic here probably needs work:
+ Logic here probably needs work (TODO):
- Direct ingests via DOI from fatcat-changelog should probably go
through regardless of GROBID status
diff --git a/python/fatcat_tools/kafka.py b/python/fatcat_tools/kafka.py
new file mode 100644
index 00000000..53b62a37
--- /dev/null
+++ b/python/fatcat_tools/kafka.py
@@ -0,0 +1,22 @@
+
+from confluent_kafka import Consumer, Producer, KafkaException
+
+
+def kafka_fail_fast(err, msg):
+ if err is not None:
+ print("Kafka producer delivery error: {}".format(err))
+ print("Bailing out...")
+ # TODO: should it be sys.exit(-1)?
+ raise KafkaException(err)
+
+def simple_kafka_producer(kafka_hosts):
+
+ kafka_config = {
+ 'bootstrap.servers': kafka_hosts,
+ 'message.max.bytes': 20000000, # ~20 MBytes; broker-side max is ~50 MBytes
+ 'delivery.report.only.error': True,
+ 'default.topic.config': {
+ 'request.required.acks': -1,
+ },
+ }
+ return Producer(kafka_config)