aboutsummaryrefslogtreecommitdiffstats
path: root/extra/cdx/cdx_reshape.py
blob: 6b3d6e533c9af36610bbecdfec4af29288d135f2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19

# Example input record (one JSON object per line):
# {"summary":{"last":"20131126234003","ok":"20131126234003","delta":0},"numRows":1,"line":"http://128.118.178.162/eps/pe/papers/0501/0501001.pdf"}

import fileinput
import json

# Reshape line-delimited JSON records into sorted, tab-separated output.
#
# Input comes from fileinput (files named on the command line, or stdin) and
# holds one JSON object per line. Records whose "summary" has no truthy "last"
# value are dropped; the rest are keyed by their "line" field, so a later
# record with the same key replaces an earlier one. Output is one
# "<key>\t<json>" row per surviving record, ordered by key.
by_url = {}

for line in fileinput.input():
    line = line.strip()
    if not line:
        # Blank lines (e.g. a trailing empty line) are not records;
        # json.loads("") would raise and abort the whole run.
        continue
    doc = json.loads(line)
    # "summary" may be absent or explicitly null; treat both as empty so the
    # record is skipped instead of raising AttributeError on None.
    summary = doc.get("summary") or {}
    if not summary.get("last"):
        continue
    by_url[doc["line"]] = doc


# Dict keys are unique, so ordering by key alone gives the same order as
# sorting the (key, value) pairs, without ever comparing the dict values.
for url in sorted(by_url):
    print("{}\t{}".format(url, json.dumps(by_url[url])))