aboutsummaryrefslogtreecommitdiffstats
path: root/extra/cdx/cdx_reshape.py
diff options
context:
space:
mode:
Diffstat (limited to 'extra/cdx/cdx_reshape.py')
-rw-r--r-- extra/cdx/cdx_reshape.py 19
1 files changed, 19 insertions, 0 deletions
diff --git a/extra/cdx/cdx_reshape.py b/extra/cdx/cdx_reshape.py
new file mode 100644
index 0000000..4e96134
--- /dev/null
+++ b/extra/cdx/cdx_reshape.py
@@ -0,0 +1,19 @@
+
+# {"summary":{"last":"20131126234003","ok":"20131126234003","delta":0},"numRows":1,"line":"http://128.118.178.162/eps/pe/papers/0501/0501001.pdf"}
+
+import fileinput
+import json
+
+by_url = {}
+# Keep one record per "line" value (the URL); a later record for the same URL replaces the earlier one.
+for line in fileinput.input():
+    line = line.strip()
+    doc = json.loads(line)
+    last = doc.get("summary", {}).get("last", "")
+    if not last:
+        continue  # records without a non-empty summary.last are dropped
+    by_url[doc["line"]] = doc
+
+# Sort (url, doc) pairs: sorted(by_url) alone yields bare URL strings, which cannot be unpacked into k, v.
+for k, v in sorted(by_url.items()):
+    print("{}\t{}".format(k, v))