# Origin: extra/cdx/cdx_reshape.py (commit 552cb824889e, "extra: cdx reshape")
"""Index JSON lines by URL and print them, sorted, as tab-separated rows.

Input is newline-delimited JSON read from the files named on the command
line (or stdin when none are given), one document per line, for example:

    {"summary":{"last":"20131126234003","ok":"20131126234003","delta":0},"numRows":1,"line":"http://128.118.178.162/eps/pe/papers/0501/0501001.pdf"}

Documents whose ``summary.last`` is missing or empty are dropped. The rest
are keyed by their ``line`` value; when the same key occurs more than once
the last document read wins. One row per key is printed in sorted key
order: the key, a TAB, then the document.
"""

import fileinput
import json


def collect_by_url(lines):
    """Return a dict mapping each document's ``line`` value to the document.

    Args:
        lines: iterable of strings, each holding one JSON document.
            Surrounding whitespace is stripped and blank lines are skipped.

    Returns:
        dict from ``doc["line"]`` to the parsed document, restricted to
        documents with a non-empty ``summary.last``.

    Raises:
        json.JSONDecodeError: a non-blank line is not valid JSON.
        KeyError: a kept document has no ``line`` key.
    """
    by_url = {}
    for raw in lines:
        text = raw.strip()
        if not text:
            # json.loads("") raises; blank lines carry no document.
            continue
        doc = json.loads(text)
        # `or {}` also covers an explicit `"summary": null`.
        summary = doc.get("summary") or {}
        if not summary.get("last", ""):
            continue
        by_url[doc["line"]] = doc
    return by_url


def format_rows(by_url):
    """Return one ``"<key>\\t<doc>"`` string per entry, in sorted key order.

    Iterating a dict yields only its keys, so each value is looked up
    explicitly rather than unpacked from the sorted sequence.
    """
    return ["{}\t{}".format(url, by_url[url]) for url in sorted(by_url)]


def main():
    """Read the input via ``fileinput`` and print the reshaped rows."""
    for row in format_rows(collect_by_url(fileinput.input())):
        print(row)


if __name__ == "__main__":
    main()