aboutsummaryrefslogtreecommitdiffstats
path: root/fatcat_scholar/work_pipeline.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-07-27 15:54:55 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-07-27 15:54:57 -0700
commit 6443fd5756779e2cca986e73e5008a073ef96452 (patch)
tree deaac6ff17a9dbafcd2142c1b2a4679691403634 /fatcat_scholar/work_pipeline.py
parent 0234f6188b6f10dad96e2ea8dc67d2674c26c02d (diff)
download fatcat-scholar-6443fd5756779e2cca986e73e5008a073ef96452.tar.gz
fatcat-scholar-6443fd5756779e2cca986e73e5008a073ef96452.zip
json: exclude None in output, and sort keys
These are both size/performance enhancements. Not including 'None' values will reduce document sizes on-disk and over network, particularly for intermediate objects. Sorting by key should improve compression ratios across multiple documents, both on-disk (gzip) and in elasticsearch itself: https://www.elastic.co/guide/en/elasticsearch/reference/current/tune-for-disk-usage.html#_put_fields_in_the_same_order_in_documents
Diffstat (limited to 'fatcat_scholar/work_pipeline.py')
-rw-r--r-- fatcat_scholar/work_pipeline.py 4
1 files changed, 2 insertions, 2 deletions
diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py
index accbc1d..b2ceaf8 100644
--- a/fatcat_scholar/work_pipeline.py
+++ b/fatcat_scholar/work_pipeline.py
@@ -355,7 +355,7 @@ class WorkPipeline:
continue
if batch:
ib = self.process_release_list(batch)
- print(ib.json())
+ print(ib.json(exclude_none=True))
batch_work_id = None
batch = [
release,
@@ -364,7 +364,7 @@ class WorkPipeline:
if batch:
ib = self.process_release_list(batch)
- print(ib.json())
+ print(ib.json(exclude_none=True, sort_keys=True))
def main() -> None: