From 2fdba24da0e0bf3d300cfb959514bf57a3cf6701 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 4 Nov 2020 17:07:46 -0800
Subject: kafka: new XML+HTML topics

---
 kafka/topics.md | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/kafka/topics.md b/kafka/topics.md
index ebe7a61..fa1bd6d 100644
--- a/kafka/topics.md
+++ b/kafka/topics.md
@@ -59,6 +59,18 @@ retention (on both a size and time basis).
         => 12 partitions
         => key is sha1hex of PDF; enable key compaction; gzip compression
 
+    sandcrawler-ENV.xml-doc
+        => fulltext XML; mostly JATS XML
+        => schema is JSON, with 'jats_xml' field containing the XML as a string
+        => 6 partitions
+        => key is sha1hex of XML document; enable key compaction; gzip compression
+
+    sandcrawler-ENV.html-teixml
+        => extracted fulltext from HTML; mostly TEI-XML
+        => schema is JSON, with 'tei_xml' field containing the XML as a string
+        => 6 partitions
+        => key is sha1hex of source HTML document; enable key compaction; gzip compression
+
     sandcrawler-ENV.pdf-thumbnail-SIZE-TYPE
         => thumbnail images (eg, png, jpg) from PDFs
         => raw bytes in message (no JSON or other wrapping). fields average 10 KByte
@@ -175,3 +187,6 @@ exists`; this seems safe, and the settings won't be over-ridden.
     ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 12 --topic sandcrawler-qa.pdf-thumbnail-180px-jpg --config cleanup.policy=compact
     ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 24 --topic sandcrawler-qa.unextracted
 
+    ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic sandcrawler-qa.xml-doc --config compression.type=gzip --config cleanup.policy=compact
+    ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic sandcrawler-qa.html-teixml --config compression.type=gzip --config cleanup.policy=compact
+
-- 
cgit v1.2.3