diff options
-rw-r--r-- | extra/cdx/cdx_reshape.py | 19 |
1 files changed, 19 insertions, 0 deletions
# extra/cdx/cdx_reshape.py
#
# Reads JSON documents, one per line, from the files named on the command
# line (or stdin when none are given), keeps the documents whose
# summary has a non-empty "last" value, and prints one
# "<line-field>\t<document>" row per distinct "line" value, sorted by it.
#
# Example input line:
# {"summary":{"last":"20131126234003","ok":"20131126234003","delta":0},"numRows":1,"line":"http://128.118.178.162/eps/pe/papers/0501/0501001.pdf"}

import fileinput
import json


def collect_by_url(lines):
    """Index the parsed JSON documents from *lines* by their "line" field.

    Args:
        lines: iterable of strings, each holding one JSON object.

    Returns:
        dict mapping each document's "line" value to the parsed document.
        Documents whose ``summary.last`` is missing or empty are skipped;
        when the same "line" value occurs more than once, the last
        occurrence wins.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a retained document has no "line" field.
    """
    by_url = {}
    for raw in lines:
        text = raw.strip()
        if not text:
            # Blank lines (e.g. a trailing newline) are not JSON; skip them
            # instead of crashing in json.loads.
            continue
        doc = json.loads(text)
        last = doc.get("summary", {}).get("last", "")
        if not last:
            continue
        by_url[doc["line"]] = doc
    return by_url


def format_rows(by_url):
    """Yield one tab-separated "key<TAB>document" string per entry, by key.

    Iterates ``by_url.items()``: sorting the dict itself yields only the
    keys, and unpacking a key string into two names raises ValueError.
    """
    # NOTE(review): the document is rendered with str.format, i.e. as a
    # Python dict repr rather than JSON -- confirm this is the wanted output.
    for url, doc in sorted(by_url.items()):
        yield "{}\t{}".format(url, doc)


def main():
    """Reshape the input named on the command line (or stdin) to stdout."""
    for row in format_rows(collect_by_url(fileinput.input())):
        print(row)


if __name__ == "__main__":
    main()