about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-06-16 17:10:44 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-06-16 17:10:44 -0700
commit 9df71395046d045d7f8b568a55de4ea000de8791 (patch)
tree ca160903d63dca48a02706447a9003e7f369416d
parent b839dcb734805397b8bf611eb77942b9555f4915 (diff)
download sandcrawler-9df71395046d045d7f8b568a55de4ea000de8791.tar.gz
sandcrawler-9df71395046d045d7f8b568a55de4ea000de8791.zip
rename KafkaGrobidSink -> KafkaCompressSink
-rwxr-xr-x python/grobid_tool.py 2
-rw-r--r-- python/sandcrawler/__init__.py 2
-rw-r--r-- python/sandcrawler/workers.py 2
3 files changed, 3 insertions, 3 deletions
diff --git a/python/grobid_tool.py b/python/grobid_tool.py
index a2d74a1..fe507a0 100755
--- a/python/grobid_tool.py
+++ b/python/grobid_tool.py
@@ -140,7 +140,7 @@ def main():
if args.kafka_mode:
produce_topic = "sandcrawler-{}.grobid-output-pg".format(args.kafka_env)
print("Running in kafka output mode, publishing to {}\n".format(produce_topic))
- args.sink = KafkaGrobidSink(kafka_hosts=args.kafka_hosts,
+ args.sink = KafkaCompressSink(kafka_hosts=args.kafka_hosts,
produce_topic=produce_topic)
args.func(args)
diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py
index 492b558..654df35 100644
--- a/python/sandcrawler/__init__.py
+++ b/python/sandcrawler/__init__.py
@@ -2,7 +2,7 @@
from .grobid import GrobidClient, GrobidWorker, GrobidBlobWorker
from .pdftrio import PdfTrioClient, PdfTrioWorker, PdfTrioBlobWorker
from .misc import gen_file_metadata, b32_hex, parse_cdx_line, parse_cdx_datetime, clean_url
-from .workers import KafkaSink, KafkaGrobidSink, JsonLinePusher, CdxLinePusher, CdxLinePusher, KafkaJsonPusher, BlackholeSink, ZipfilePusher, MultiprocessWrapper
+from .workers import KafkaSink, KafkaCompressSink, JsonLinePusher, CdxLinePusher, CdxLinePusher, KafkaJsonPusher, BlackholeSink, ZipfilePusher, MultiprocessWrapper
from .ia import WaybackClient, WaybackError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError, PetaboxError, ResourceResult, WarcResource, CdxPartial, CdxRow
from .ingest import IngestFileWorker
from .persist import PersistCdxWorker, PersistIngestFileResultWorker, PersistGrobidWorker, PersistGrobidDiskWorker, PersistPdfTrioWorker, PersistIngestRequestWorker
diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py
index 6425e99..a42b1a4 100644
--- a/python/sandcrawler/workers.py
+++ b/python/sandcrawler/workers.py
@@ -217,7 +217,7 @@ class KafkaSink(SandcrawlerWorker):
return self.counts
-class KafkaGrobidSink(KafkaSink):
+class KafkaCompressSink(KafkaSink):
"""
Variant of KafkaSink for large documents. Used for, eg, GROBID output.
"""