diff options
| author | Martin Czygan <martin.czygan@gmail.com> | 2021-07-15 03:26:59 +0200 | 
|---|---|---|
| committer | Martin Czygan <martin.czygan@gmail.com> | 2021-07-15 03:26:59 +0200 | 
| commit | 552cb824889e146f4da0df682d96b1047b0e2402 (patch) | |
| tree | 4e00e5cc4c86dc58514ab8a63eb5b571ab054132 | |
| parent | 296b0f203ef5f85b709ec67c2b0bf3f63e9e54c0 (diff) | |
| download | refcat-552cb824889e146f4da0df682d96b1047b0e2402.tar.gz refcat-552cb824889e146f4da0df682d96b1047b0e2402.zip  | |
extra: cdx reshape
| -rw-r--r-- | extra/cdx/cdx_reshape.py | 19 | 
1 files changed, 19 insertions, 0 deletions
"""Reshape CDX summary lines (one JSON document per line) into URL-keyed rows.

Reads JSON lines from the files named on the command line (or stdin), keeps
only documents that carry a non-empty ``summary.last`` value, and prints one
tab-separated ``<url>\t<doc>`` row per distinct URL, sorted by URL.

Example input line:

{"summary":{"last":"20131126234003","ok":"20131126234003","delta":0},"numRows":1,"line":"http://128.118.178.162/eps/pe/papers/0501/0501001.pdf"}
"""

import fileinput
import json


def reshape(lines):
    """Group parsed documents by their ``line`` (URL) field.

    Args:
        lines: iterable of strings, each holding one JSON document.

    Returns:
        dict mapping ``doc["line"]`` to the parsed document. Documents whose
        ``summary.last`` is missing or empty are dropped; when a URL occurs
        more than once, the last occurrence wins.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a kept document has no ``line`` field.
    """
    by_url = {}
    for line in lines:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline) would make json.loads
            # raise; they carry no record, so skip them.
            continue
        doc = json.loads(line)
        last = doc.get("summary", {}).get("last", "")
        if not last:
            continue
        by_url[doc["line"]] = doc
    return by_url


def format_rows(by_url):
    """Yield one ``<url>\t<doc>`` string per entry, ordered by URL.

    Iterates over ``items()``: iterating the dict directly yields only the
    keys, and unpacking a URL string into ``k, v`` raises ValueError.
    """
    # Keys are unique, so tuple comparison never falls through to the dicts.
    for url, doc in sorted(by_url.items()):
        # NOTE(review): this emits the Python dict repr, not JSON; switch to
        # json.dumps(doc) if downstream consumers expect JSON - confirm.
        yield "{}\t{}".format(url, doc)


def main():
    """Read input via fileinput and print the reshaped rows to stdout."""
    for row in format_rows(reshape(fileinput.input())):
        print(row)


if __name__ == "__main__":
    main()
