From c12148851e26c14b38ec6cadbe2322829fde23e6 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 20 Nov 2018 15:09:43 -0800 Subject: initial work on kafka_grobid worker --- kafka/grobid_kafka_notes.txt | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kafka') diff --git a/kafka/grobid_kafka_notes.txt b/kafka/grobid_kafka_notes.txt index f774291..26c450f 100644 --- a/kafka/grobid_kafka_notes.txt +++ b/kafka/grobid_kafka_notes.txt @@ -22,3 +22,21 @@ this... Need to ensure we have compression enabled, for the GROBID output in particular! Probably worth using "expensive" GZIP compression to get extra disk savings; latency shouldn't be a big deal here. + +## Commands + +Load up some example lines, without partition key: + + head -n10 python/tests/files/example_ungrobided.tsv | kafkacat -P -b localhost:9092 -t sandcrawler-qa.ungrobided + +Load up some example lines, with partition key: + + head -n10 python/tests/files/example_ungrobided.tsv | awk -F'\t' '{print $1 "\t" $0}' | kafkacat -K$'\t' -P -b localhost:9092 -t sandcrawler-qa.ungrobided + +Check ungrobided topic: + + kafkacat -C -b localhost:9092 -t sandcrawler-qa.ungrobided + +Check grobid output: + + kafkacat -C -b localhost:9092 -t sandcrawler-qa.grobided -- cgit v1.2.3