From 16f567d88cca7e79c36e4c06205861c7fe70bfa7 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 21 Nov 2018 22:07:26 -0800 Subject: more kafka/grobid notes --- kafka/grobid_kafka_notes.txt | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kafka') diff --git a/kafka/grobid_kafka_notes.txt b/kafka/grobid_kafka_notes.txt index 0e565aa..d8bb171 100644 --- a/kafka/grobid_kafka_notes.txt +++ b/kafka/grobid_kafka_notes.txt @@ -40,3 +40,15 @@ Check ungrobided topic: Check grobid output: kafkacat -C -b localhost:9092 -t sandcrawler-qa.grobid-output + +## Performance + +On 2018-11-21, using grobid-vm (svc096) with 30 cores, and running with 50x +kafka-grobid-worker processes (using systemd parallelization), achieved: + +- 2044 PDFs extracted in 197 seconds, or about 10/second +- that's about 28 hours to process 1 million PDFs + +I think this is about all a single machine can handle. To get more throughput +with multiple machines, we might need to tweak the worker to use a worker thread-pool +or some other concurrent pattern (async?). -- cgit v1.2.3