aboutsummaryrefslogtreecommitdiffstats
path: root/extra/cdx/cdx_reshape.py
blob: 28b6fc08e61d0e4a78845d8867dea74758395712 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18

# Example input record (one JSON object per line):
# {"summary":{"last":"20131126234003","ok":"20131126234003","delta":0},"numRows":1,"line":"http://128.118.178.162/eps/pe/papers/0501/0501001.pdf"}

import fileinput
import json

def index_by_url(lines):
    """Parse JSON-lines records and index them by their "line" field.

    Each non-blank input line is decoded as one JSON object. Records whose
    "numRows" value equals 0 are dropped. When several records share the
    same "line" value, the last one read replaces the earlier ones.

    Args:
        lines: Iterable of strings, one JSON document per string.

    Returns:
        dict mapping each record's "line" value to the decoded record.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a kept record has no "line" key.
    """
    by_url = {}
    for line in lines:
        line = line.strip()
        if not line:
            # A blank line is not a JSON document; json.loads("") would
            # raise and abort the whole run, so skip it instead.
            continue
        doc = json.loads(line)
        if doc.get("numRows") == 0:
            continue
        by_url[doc["line"]] = doc
    return by_url


def format_rows(by_url):
    """Yield one "<key>\\t<json>" string per entry, ordered by key.

    Args:
        by_url: Mapping as returned by index_by_url().

    Yields:
        The key, a tab, and the record re-serialized with json.dumps().
    """
    for url, doc in sorted(by_url.items()):
        yield "{}\t{}".format(url, json.dumps(doc))


def main():
    """Read records from the files named in argv (or stdin) and print rows."""
    for row in format_rows(index_by_url(fileinput.input())):
        print(row)


if __name__ == "__main__":
    main()