diff options
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r-- | python/fatcat_tools/__init__.py | 1 | ||||
-rw-r--r-- | python/fatcat_tools/importers/ingest.py | 7 | ||||
-rw-r--r-- | python/fatcat_tools/kafka.py | 22 |
3 files changed, 28 insertions, 2 deletions
diff --git a/python/fatcat_tools/__init__.py b/python/fatcat_tools/__init__.py index f2798f0b..78c5c90b 100644 --- a/python/fatcat_tools/__init__.py +++ b/python/fatcat_tools/__init__.py @@ -2,3 +2,4 @@ from .api_auth import authenticated_api, public_api from .fcid import fcid2uuid, uuid2fcid from .transforms import * +from .kafka import simple_kafka_producer, kafka_fail_fast diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index 7dad13ce..deb4ef51 100644 --- a/python/fatcat_tools/importers/ingest.py +++ b/python/fatcat_tools/importers/ingest.py @@ -28,13 +28,16 @@ class IngestFileResultImporter(EntityImporter): print("Requiring GROBID status == 200") else: print("NOT checking GROBID success") - self.ingest_request_source_whitelist = ['fatcat-changelog'] + self.ingest_request_source_whitelist = [ + 'fatcat-changelog', + 'fatcat-ingest-container', + ] if kwargs.get('skip_source_whitelist', False): self.ingest_request_source_whitelist = [] def want(self, row): """ - Logic here probably needs work: + Logic here probably needs work (TODO): - Direct ingests via DOI from fatcat-changelog should probably go through regardless of GROBID status diff --git a/python/fatcat_tools/kafka.py b/python/fatcat_tools/kafka.py new file mode 100644 index 00000000..53b62a37 --- /dev/null +++ b/python/fatcat_tools/kafka.py @@ -0,0 +1,22 @@ + +from confluent_kafka import Consumer, Producer, KafkaException + + +def kafka_fail_fast(err, msg): + if err is not None: + print("Kafka producer delivery error: {}".format(err)) + print("Bailing out...") + # TODO: should it be sys.exit(-1)? + raise KafkaException(err) + +def simple_kafka_producer(kafka_hosts): + + kafka_config = { + 'bootstrap.servers': kafka_hosts, + 'message.max.bytes': 20000000, # ~20 MBytes; broker-side max is ~50 MBytes + 'delivery.report.only.error': True, + 'default.topic.config': { + 'request.required.acks': -1, + }, + } + return Producer(kafka_config) |