diff options
-rw-r--r-- | proposals/2020_pdf_meta_thumbnails.md | 7 | ||||
-rw-r--r-- | python/sandcrawler/persist.py | 3 | ||||
-rwxr-xr-x | python/sandcrawler_worker.py | 19 |
3 files changed, 16 insertions, 13 deletions
diff --git a/proposals/2020_pdf_meta_thumbnails.md b/proposals/2020_pdf_meta_thumbnails.md index d7578cb..eacbfa5 100644 --- a/proposals/2020_pdf_meta_thumbnails.md +++ b/proposals/2020_pdf_meta_thumbnails.md @@ -22,7 +22,7 @@ against the existing SQL table to avoid duplication of processing. ## PDF Metadata and Text -Kafka topic (name: `sandcrawler-ENV.pdftext`; 12x partitions; gzip +Kafka topic (name: `sandcrawler-ENV.pdf-text`; 12x partitions; gzip compression) JSON schema: sha1hex (string; used as key) @@ -73,8 +73,9 @@ Kafka, and we don't want SQL table size to explode. Schema: Kafka Schema is raw image bytes as message body; sha1sum of PDF as the key. No compression, 12x partitions. -Topic name is `sandcrawler-ENV.thumbnail-SIZE-png`. Thus, topic name contains -the "metadata" of thumbail size/shape. +Kafka topic name is `sandcrawler-ENV.pdf-thumbnail-SIZE-TYPE` (eg, +`sandcrawler-qa.pdf-thumbnail-180px-jpg`). Thus, topic name contains the +"metadata" of thumbnail size/shape. Have decided to use JPEG thumbnails, 180px wide (and max 300px high, though width restriction is almost always the limiting factor). 
This size matches that diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py index 196c4b9..8d421ad 100644 --- a/python/sandcrawler/persist.py +++ b/python/sandcrawler/persist.py @@ -452,6 +452,7 @@ class PersistThumbnailWorker(SandcrawlerWorker): default_bucket=kwargs['s3_bucket'], ) self.s3_extension = kwargs.get('s3_extension', ".jpg") + self.s3_folder = kwargs.get('s3_folder', "pdf") def process(self, blob, key=None): """ @@ -463,7 +464,7 @@ class PersistThumbnailWorker(SandcrawlerWorker): assert len(blob) >= 50 resp = self.s3.put_blob( - folder="thumbnail", + folder=self.s3_folder, blob=blob, sha1hex=key, extension=self.s3_extension, diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py index 950eb4b..e18d883 100755 --- a/python/sandcrawler_worker.py +++ b/python/sandcrawler_worker.py @@ -71,9 +71,9 @@ def run_persist_grobid(args): pusher.run() def run_pdf_extract(args): - consume_topic = "sandcrawler-{}.unextracted-pg".format(args.env) - text_topic = "sandcrawler-{}.pdftext".format(args.env) - thumbnail_topic = "sandcrawler-{}.thumbnail-180px-jpeg".format(args.env) + consume_topic = "sandcrawler-{}.unextracted".format(args.env) + text_topic = "sandcrawler-{}.pdf-text".format(args.env) + thumbnail_topic = "sandcrawler-{}.pdf-thumbnail-180px-jpg".format(args.env) text_sink = KafkaSink( kafka_hosts=args.kafka_hosts, produce_topic=text_topic, @@ -100,7 +100,7 @@ def run_pdf_extract(args): pusher.run() def run_persist_pdftext(args): - consume_topic = "sandcrawler-{}.pdftext".format(args.env) + consume_topic = "sandcrawler-{}.pdf-text".format(args.env) worker = PersistPdfTextWorker( db_url=args.db_url, s3_url=args.s3_url, @@ -114,27 +114,28 @@ def run_persist_pdftext(args): worker=worker, kafka_hosts=args.kafka_hosts, consume_topic=consume_topic, - group="persist-pdftext", + group="persist-pdf-text", push_batches=True, batch_size=25, ) pusher.run() def run_persist_thumbnail(args): - consume_topic = 
"sandcrawler-{}.thumbnail".format(args.env) + consume_topic = "sandcrawler-{}.pdf-thumbnail-180px-jpg".format(args.env) worker = PersistThumbnailWorker( s3_url=args.s3_url, s3_bucket=args.s3_bucket, s3_access_key=args.s3_access_key, s3_secret_key=args.s3_secret_key, - # TODO: s3_extension=args.s3_extension, + s3_extension=".180px.jpg", + s3_folder="pdf", ) pusher = KafkaJsonPusher( worker=worker, kafka_hosts=args.kafka_hosts, consume_topic=consume_topic, - group="persist-thumbnail", - raw_records=True, + group="persist-pdf-thumbnail", + raw_record=True, batch_size=25, ) pusher.run() |