aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-07-15 03:26:59 +0200
committerMartin Czygan <martin.czygan@gmail.com>2021-07-15 03:26:59 +0200
commit552cb824889e146f4da0df682d96b1047b0e2402 (patch)
tree4e00e5cc4c86dc58514ab8a63eb5b571ab054132
parent296b0f203ef5f85b709ec67c2b0bf3f63e9e54c0 (diff)
downloadrefcat-552cb824889e146f4da0df682d96b1047b0e2402.tar.gz
refcat-552cb824889e146f4da0df682d96b1047b0e2402.zip
extra: cdx reshape
-rw-r--r--extra/cdx/cdx_reshape.py19
1 files changed, 19 insertions, 0 deletions
diff --git a/extra/cdx/cdx_reshape.py b/extra/cdx/cdx_reshape.py
new file mode 100644
index 0000000..4e96134
--- /dev/null
+++ b/extra/cdx/cdx_reshape.py
@@ -0,0 +1,19 @@
+
+# Example input line: {"summary":{"last":"20131126234003","ok":"20131126234003","delta":0},"numRows":1,"line":"http://128.118.178.162/eps/pe/papers/0501/0501001.pdf"}
+
+import fileinput
+import json
+
+by_url = {}  # URL (the "line" field) -> most recently read JSON doc for it
+
+# Read JSON lines (files named in argv, else stdin); skip docs without summary.last.
+for line in fileinput.input():
+    doc = json.loads(line.strip())
+    last = doc.get("summary", {}).get("last", "")
+    if not last:
+        continue
+    by_url[doc["line"]] = doc
+
+# Emit "URL<TAB>doc" sorted by URL; .items() is needed to get (key, value) pairs.
+for k, v in sorted(by_url.items()):
+    print("{}\t{}".format(k, v))