aboutsummaryrefslogtreecommitdiffstats
path: root/python/scripts/enrich_scored_matches.py
blob: 44c091cd34ded3d45cfd0c785d72ed24830778a1 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#!/usr/bin/env python3
"""
Takes a "joined" TSV input stream:

- sha1
- dois (JSON list)
- cdx (JSON object)
    - url
    - dt
    (etc)
- mimetype
- size (integer)

And outputs JSON objects that can be imported into fatcat with the
"matched" script.

No dependencies (only python3 stdlib)
"""

import base64
import json
import sys


def _enrich_row(fields):
    """Build one output record from the five raw columns of an input row.

    ``fields`` is the list of column strings in input order:
    ``[sha1, dois, cdx, mimetype, size]``.

    Returns a dict with keys ``sha1`` (lowercase hex), ``dois`` (decoded
    JSON), ``cdx`` (a one-element list holding only ``url`` and ``dt``),
    ``size`` (int) and ``mimetype`` (passed through unchanged).

    Raises ``ValueError`` if a JSON column, the size, or the base32 SHA-1
    cannot be decoded, and ``KeyError`` if the CDX object lacks ``url`` or
    ``dt``.
    """
    raw_sha1, dois_json, cdx_json, mimetype, size_text = fields
    dois = json.loads(dois_json)
    cdx = json.loads(cdx_json)
    # int() tolerates the trailing newline left on the last column.
    size = int(size_text)

    # Input carries the SHA-1 base32-encoded (with a "sha1:" tag removed
    # here); the output uses lowercase hex.
    b32_sha1 = raw_sha1.replace("sha1:", "")
    sha1 = base64.b16encode(base64.b32decode(b32_sha1)).decode("ascii").lower()

    return dict(
        sha1=sha1,
        dois=dois,
        # Only the URL and timestamp of the CDX object are kept.
        cdx=[dict(url=cdx["url"], dt=cdx["dt"])],
        size=size,
        mimetype=mimetype,
    )


def run():
    """Convert TSV rows on stdin into one JSON object per line on stdout.

    Each input line must contain exactly five tab-separated columns
    (sha1, dois, cdx, mimetype, size).

    Raises ``ValueError`` naming the offending line number when a row does
    not have five columns. This is an explicit check rather than an
    ``assert`` so that it still runs under ``python -O``.
    """
    for lineno, line in enumerate(sys.stdin, start=1):
        fields = line.split("\t")
        if len(fields) != 5:
            raise ValueError(
                "line {}: expected 5 tab-separated fields, got {}".format(
                    lineno, len(fields)
                )
            )
        print(json.dumps(_enrich_row(fields)))


# Run the stdin-to-stdout conversion only when executed as a script, so that
# importing this module has no side effects.
if __name__ == "__main__":
    run()