diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2018-09-24 18:07:15 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2018-09-24 18:07:15 -0700 |
commit | bf0c3399d3a80b32301a3554971a962478614692 (patch) | |
tree | cba3bc6b33c07880a63d023f4129c7dec515889d /extra/partition_dumps/partition_script.py | |
parent | ed78736b5d96a294c89b201c1d992bd30d809434 (diff) | |
download | fatcat-bf0c3399d3a80b32301a3554971a962478614692.tar.gz fatcat-bf0c3399d3a80b32301a3554971a962478614692.zip |
script for partitioning dumps (needs test)
Diffstat (limited to 'extra/partition_dumps/partition_script.py')
-rwxr-xr-x | extra/partition_dumps/partition_script.py | 42 |
1 files changed, 42 insertions, 0 deletions
#!/usr/bin/env python3
"""
Reads key-prefixed JSON lines from stdin, and writes out to gzipped files under
./partitioned/.

Skips empty keys and "null" (to handle a jq common-case). Blank lines and
lines without a tab separator are skipped as well.

Eg, for tab-separated input:

    something   {"a": 1}
    something2  {"b": 2}

Will write to ./partitioned/something.json.gz:

    {"a": 1}

(and "b" object to ./partitioned/something2.json.gz)
"""

import gzip
import os
import sys


def run():
    """Partition tab-separated ``key<TAB>json`` lines from stdin into gzip files.

    Each accepted line's second field is appended, followed by a newline, to
    ``partitioned/<key>.json.gz``. The ``partitioned`` directory is created in
    the current working directory if it does not exist. Only the first two
    tab-separated fields of a line are used; any further fields are ignored.

    Skipped lines:
      * lines whose key is empty or the literal string ``null``;
      * blank lines;
      * non-blank lines with no tab separator (a warning goes to stderr).

    Only one output file is held open at a time: it is swapped whenever the
    key differs from the previous accepted line's key. Files are opened in
    append mode, so a key that shows up again later (or in a later run) adds
    to the existing file instead of truncating it.

    NOTE: the key is used verbatim as part of the output path, so keys
    containing path separators are not sanitized here.
    """
    last_prefix = None
    out = None
    os.makedirs('partitioned', exist_ok=True)

    try:
        for line in sys.stdin:
            # Strip only the line terminator before splitting: stripping all
            # whitespace first would eat the leading tab of an empty-key line
            # and make it indistinguishable from a line with no key column.
            fields = line.rstrip('\r\n').split('\t')
            if len(fields) < 2:
                if line.strip():
                    print(
                        "skipping line without tab separator: {!r}".format(line),
                        file=sys.stderr,
                    )
                continue

            prefix = fields[0].strip()
            obj = fields[1].strip()
            if not prefix or prefix == "null":
                continue

            if prefix != last_prefix:
                if out is not None:
                    out.close()
                    # Forget the closed handle so a failure while opening the
                    # next file does not lead to a second close() below.
                    out = None
                out = gzip.GzipFile(
                    'partitioned/{}.json.gz'.format(prefix), 'a')
                last_prefix = prefix

            out.write(obj.encode('utf-8'))
            out.write(b"\n")
    finally:
        # Always close so the gzip trailer is written, even if a line fails.
        if out is not None:
            out.close()


if __name__ == "__main__":
    run()