diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2018-09-24 18:07:15 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2018-09-24 18:07:15 -0700 |
commit | bf0c3399d3a80b32301a3554971a962478614692 (patch) | |
tree | cba3bc6b33c07880a63d023f4129c7dec515889d /extra/partition_dumps/README.md | |
parent | ed78736b5d96a294c89b201c1d992bd30d809434 (diff) | |
download | fatcat-bf0c3399d3a80b32301a3554971a962478614692.tar.gz fatcat-bf0c3399d3a80b32301a3554971a962478614692.zip |
script for partitioning dumps (needs test)
Diffstat (limited to 'extra/partition_dumps/README.md')
-rw-r--r-- | extra/partition_dumps/README.md | 25 |
1 files changed, 25 insertions, 0 deletions
diff --git a/extra/partition_dumps/README.md b/extra/partition_dumps/README.md new file mode 100644 index 00000000..2e26a41b --- /dev/null +++ b/extra/partition_dumps/README.md @@ -0,0 +1,25 @@ + +This script is used to "partition" (split up) a complete JSON dump by some key. +For example, split release dump JSON lines into separate files, one per +journal/container. + +Example partitioning a sample by release type: + + cat release_dump_expanded_sample.json | jq .release_type -r > release_dump_expanded_sample.release_type + cat release_dump_expanded_sample.release_type | sort | uniq -c | sort -nr > release_dump_expanded_sample.release_type.counts + cat release_dump_expanded_sample.json | paste release_dump_expanded_sample.release_type - | sort > out + +More production-y example using ISSN-L: + + # will append otherwise + rm -rf ./partitioned + + # it's a pretty huge sort, will need 300+ GB scratch space? this might not scale. + zcat release_dump_expanded.json.gz | jq .container.issnl -r > release_dump_expanded.issnl + zcat release_dump_expanded.json.gz | paste release_dump_expanded.issnl - | sort | ./partition_script.py + + # for verification/stats + cat release_dump_expanded.issnl | sort | uniq -c | sort -nr > release_dump_expanded.issnl.counts + + # cleanup + rm release_dump_expanded.issnl |