summaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools
diff options
context:
space:
mode:
authorbnewbold <bnewbold@archive.org>2019-12-12 18:52:46 +0000
committerbnewbold <bnewbold@archive.org>2019-12-12 18:52:46 +0000
commit374ed6ccac6191461616ac3df85daf3a3a9ab2ed (patch)
treec3cd2e912eaa97ada104303a9fb5c857e60a3a33 /python/fatcat_tools
parent7831f78cc9ccef7331c9176dbecb34f8afc9b54f (diff)
parent125c93748920c7a213a5bb572e19b923a3587f8b (diff)
downloadfatcat-374ed6ccac6191461616ac3df85daf3a3a9ab2ed.tar.gz
fatcat-374ed6ccac6191461616ac3df85daf3a3a9ab2ed.zip
Merge branch 'bnewbold-ingest-oa-container' into 'master'
container-ingest tool See merge request webgroup/fatcat!8
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r--python/fatcat_tools/__init__.py1
-rw-r--r--python/fatcat_tools/importers/ingest.py7
-rw-r--r--python/fatcat_tools/kafka.py22
3 files changed, 28 insertions, 2 deletions
diff --git a/python/fatcat_tools/__init__.py b/python/fatcat_tools/__init__.py
index f2798f0b..78c5c90b 100644
--- a/python/fatcat_tools/__init__.py
+++ b/python/fatcat_tools/__init__.py
@@ -2,3 +2,4 @@
from .api_auth import authenticated_api, public_api
from .fcid import fcid2uuid, uuid2fcid
from .transforms import *
+from .kafka import simple_kafka_producer, kafka_fail_fast
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index 7dad13ce..deb4ef51 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -28,13 +28,16 @@ class IngestFileResultImporter(EntityImporter):
print("Requiring GROBID status == 200")
else:
print("NOT checking GROBID success")
- self.ingest_request_source_whitelist = ['fatcat-changelog']
+ self.ingest_request_source_whitelist = [
+ 'fatcat-changelog',
+ 'fatcat-ingest-container',
+ ]
if kwargs.get('skip_source_whitelist', False):
self.ingest_request_source_whitelist = []
def want(self, row):
"""
- Logic here probably needs work:
+ Logic here probably needs work (TODO):
- Direct ingests via DOI from fatcat-changelog should probably go
through regardless of GROBID status
diff --git a/python/fatcat_tools/kafka.py b/python/fatcat_tools/kafka.py
new file mode 100644
index 00000000..53b62a37
--- /dev/null
+++ b/python/fatcat_tools/kafka.py
@@ -0,0 +1,22 @@
+
+from confluent_kafka import Consumer, Producer, KafkaException
+
+
+def kafka_fail_fast(err, msg):
+ if err is not None:
+ print("Kafka producer delivery error: {}".format(err))
+ print("Bailing out...")
+ # TODO: should it be sys.exit(-1)?
+ raise KafkaException(err)
+
+def simple_kafka_producer(kafka_hosts):
+
+ kafka_config = {
+ 'bootstrap.servers': kafka_hosts,
+ 'message.max.bytes': 20000000, # ~20 MBytes; broker-side max is ~50 MBytes
+ 'delivery.report.only.error': True,
+ 'default.topic.config': {
+ 'request.required.acks': -1,
+ },
+ }
+ return Producer(kafka_config)