diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-07-15 03:26:59 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-07-15 03:26:59 +0200 |
commit | 552cb824889e146f4da0df682d96b1047b0e2402 (patch) | |
tree | 4e00e5cc4c86dc58514ab8a63eb5b571ab054132 | |
parent | 296b0f203ef5f85b709ec67c2b0bf3f63e9e54c0 (diff) | |
download | refcat-552cb824889e146f4da0df682d96b1047b0e2402.tar.gz refcat-552cb824889e146f4da0df682d96b1047b0e2402.zip |
extra: cdx reshape
-rw-r--r-- | extra/cdx/cdx_reshape.py | 19 |
1 files changed, 19 insertions, 0 deletions
"""Reshape CDX summary lines into one tab-separated row per URL.

Script from extra/cdx/cdx_reshape.py. It reads newline-delimited JSON
documents (from the files named on the command line, or stdin when none
are given), keeps the documents whose ``summary.last`` value is non-empty,
indexes them by their ``line`` field and prints ``<line>\\t<document>``
rows sorted by that field.
"""

# Example input line:
# {"summary":{"last":"20131126234003","ok":"20131126234003","delta":0},"numRows":1,"line":"http://128.118.178.162/eps/pe/papers/0501/0501001.pdf"}

import fileinput
import json


def collect_by_url(lines):
    """Parse JSON lines and return a dict mapping ``doc["line"]`` to doc.

    Blank lines are ignored. Documents with a missing, null or empty
    ``summary.last`` value are skipped. When the same ``line`` value occurs
    more than once, the last document seen wins.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a kept document has no ``line`` field.
    """
    by_url = {}
    for line in lines:
        line = line.strip()
        if not line:
            # json.loads("") would raise; a blank line carries no record.
            continue
        doc = json.loads(line)
        # "summary" may be absent or null; treat both like an empty summary.
        summary = doc.get("summary") or {}
        if not summary.get("last", ""):
            continue
        by_url[doc["line"]] = doc
    return by_url


def format_rows(by_url):
    """Yield ``"<key>\\t<value>"`` strings for *by_url*, ordered by key.

    Sorting is done on the keys only; the values (dicts) are never
    compared with each other.
    """
    # Iterating a dict yields its keys, so the value is looked up per key
    # rather than unpacked from the key itself.
    for url in sorted(by_url):
        # NOTE(review): the value is rendered via str.format, i.e. as the
        # Python repr of the dict, not as JSON -- confirm that downstream
        # consumers expect this.
        yield "{}\t{}".format(url, by_url[url])


def main():
    """Read input via ``fileinput`` and print the reshaped rows to stdout."""
    for row in format_rows(collect_by_url(fileinput.input())):
        print(row)


if __name__ == "__main__":
    main()