aboutsummaryrefslogtreecommitdiffstats
path: root/extra/partition_dumps/README.md
blob: 2e26a41b5d51504a228fc3ff6ba1880d77182816 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25

This script (`partition_script.py`) is used to "partition" (split up) a complete JSON dump by some key.
For example, split release dump JSON lines into separate files, one per
journal/container.

Example partitioning a sample by release type:

    cat release_dump_expanded_sample.json | jq .release_type -r > release_dump_expanded_sample.release_type
    cat release_dump_expanded_sample.release_type | sort | uniq -c | sort -nr > release_dump_expanded_sample.release_type.counts
    cat release_dump_expanded_sample.json | paste release_dump_expanded_sample.release_type - | sort > out

More production-y example using ISSN-L:

    # remove any previous output first; the script will append to existing files otherwise
    rm -rf ./partitioned

    # this is a very large sort and will probably need 300+ GB of scratch space (rough estimate); it might not scale.
    zcat release_dump_expanded.json.gz | jq .container.issnl -r > release_dump_expanded.issnl
    zcat release_dump_expanded.json.gz | paste release_dump_expanded.issnl - | sort  | ./partition_script.py

    # for verification/stats
    cat release_dump_expanded.issnl | sort | uniq -c | sort -nr > release_dump_expanded.issnl.counts
    
    # cleanup
    rm release_dump_expanded.issnl