From 6443fd5756779e2cca986e73e5008a073ef96452 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 27 Jul 2020 15:54:55 -0700 Subject: json: exclude None in output, and sort keys These are both size/performance enhancements. Not including 'None' values will reduce document sizes on disk and over the network, particularly for intermediate objects. Sorting by key should improve compression ratios across multiple documents, both on disk (gzip) and in Elasticsearch itself: https://www.elastic.co/guide/en/elasticsearch/reference/current/tune-for-disk-usage.html#_put_fields_in_the_same_order_in_documents --- fatcat_scholar/sim_pipeline.py | 2 +- fatcat_scholar/transform.py | 2 +- fatcat_scholar/work_pipeline.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fatcat_scholar') diff --git a/fatcat_scholar/sim_pipeline.py b/fatcat_scholar/sim_pipeline.py index f99471f..95e5cad 100644 --- a/fatcat_scholar/sim_pipeline.py +++ b/fatcat_scholar/sim_pipeline.py @@ -149,7 +149,7 @@ class SimPipeline: issue_item_metadata=full_issue["issue_item_metadata"], ), ) - print(bundle.json()) + print(bundle.json(exclude_none=True, sort_keys=True)) count += 1 if limit is not None and count >= limit: break diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py index f6e2c46..7afbce5 100644 --- a/fatcat_scholar/transform.py +++ b/fatcat_scholar/transform.py @@ -364,7 +364,7 @@ def run_transform(infile: Sequence) -> None: es_doc = transform_heavy(heavy) if not es_doc: continue - print(es_doc.json()) + print(es_doc.json(exclude_none=True, sort_keys=True)) def main() -> None: diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py index accbc1d..b2ceaf8 100644 --- a/fatcat_scholar/work_pipeline.py +++ b/fatcat_scholar/work_pipeline.py @@ -355,7 +355,7 @@ class WorkPipeline: continue if batch: ib = self.process_release_list(batch) - print(ib.json()) + print(ib.json(exclude_none=True)) batch_work_id = None batch = [ release, @@ -364,7 
+364,7 @@ class WorkPipeline: if batch: ib = self.process_release_list(batch) - print(ib.json()) + print(ib.json(exclude_none=True, sort_keys=True)) def main() -> None: -- cgit v1.2.3