diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-06-16 17:10:44 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-06-16 17:10:44 -0700 |
commit | 9df71395046d045d7f8b568a55de4ea000de8791 (patch) | |
tree | ca160903d63dca48a02706447a9003e7f369416d /python | |
parent | b839dcb734805397b8bf611eb77942b9555f4915 (diff) | |
download | sandcrawler-9df71395046d045d7f8b568a55de4ea000de8791.tar.gz sandcrawler-9df71395046d045d7f8b568a55de4ea000de8791.zip |
rename KafkaGrobidSink -> KafkaCompressSink
Diffstat (limited to 'python')
-rwxr-xr-x | python/grobid_tool.py | 2 | ||||
-rw-r--r-- | python/sandcrawler/__init__.py | 2 | ||||
-rw-r--r-- | python/sandcrawler/workers.py | 2 |
3 files changed, 3 insertions, 3 deletions
diff --git a/python/grobid_tool.py b/python/grobid_tool.py index a2d74a1..fe507a0 100755 --- a/python/grobid_tool.py +++ b/python/grobid_tool.py @@ -140,7 +140,7 @@ def main(): if args.kafka_mode: produce_topic = "sandcrawler-{}.grobid-output-pg".format(args.kafka_env) print("Running in kafka output mode, publishing to {}\n".format(produce_topic)) - args.sink = KafkaGrobidSink(kafka_hosts=args.kafka_hosts, + args.sink = KafkaCompressSink(kafka_hosts=args.kafka_hosts, produce_topic=produce_topic) args.func(args) diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py index 492b558..654df35 100644 --- a/python/sandcrawler/__init__.py +++ b/python/sandcrawler/__init__.py @@ -2,7 +2,7 @@ from .grobid import GrobidClient, GrobidWorker, GrobidBlobWorker from .pdftrio import PdfTrioClient, PdfTrioWorker, PdfTrioBlobWorker from .misc import gen_file_metadata, b32_hex, parse_cdx_line, parse_cdx_datetime, clean_url -from .workers import KafkaSink, KafkaGrobidSink, JsonLinePusher, CdxLinePusher, CdxLinePusher, KafkaJsonPusher, BlackholeSink, ZipfilePusher, MultiprocessWrapper +from .workers import KafkaSink, KafkaCompressSink, JsonLinePusher, CdxLinePusher, CdxLinePusher, KafkaJsonPusher, BlackholeSink, ZipfilePusher, MultiprocessWrapper from .ia import WaybackClient, WaybackError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError, PetaboxError, ResourceResult, WarcResource, CdxPartial, CdxRow from .ingest import IngestFileWorker from .persist import PersistCdxWorker, PersistIngestFileResultWorker, PersistGrobidWorker, PersistGrobidDiskWorker, PersistPdfTrioWorker, PersistIngestRequestWorker diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py index 6425e99..a42b1a4 100644 --- a/python/sandcrawler/workers.py +++ b/python/sandcrawler/workers.py @@ -217,7 +217,7 @@ class KafkaSink(SandcrawlerWorker): return self.counts -class KafkaGrobidSink(KafkaSink): +class KafkaCompressSink(KafkaSink): """ Variant of KafkaSink for large documents. Used for, eg, GROBID output. """ |