aboutsummaryrefslogtreecommitdiffstats
path: root/python/scripts/enrich_scored_matches.py
blob: 9fe1499347b3124c338aa3083e299421d9534c88 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#!/usr/bin/env python3
"""
Takes an "joined" TSV input stream:

- sha1
- dois (JSON list)
- cdx (JSON object)
    - url
    - dt
    (etc)
- mimetype
- size (integer)

And outputs JSON objects that are can be imported into fatcat with the
"matched" script.

No dependencies (only python3 stdlib)
"""

import sys
import json
import base64

def run():
    for line in sys.stdin:
        line = line.split('\t')
        assert len(line) == 5
        raw_sha1 = line[0].replace('sha1:', '')
        dois = json.loads(line[1])
        cdx = json.loads(line[2])
        mimetype = line[3]
        size = int(line[4])

        sha1 = base64.b16encode(base64.b32decode(raw_sha1)).decode('ascii').lower()

        obj = dict(
            sha1=sha1,
            dois=dois,
            cdx=[dict(url=cdx['url'], dt=cdx['dt'])],
            size=size,
            mimetype=mimetype)
        print(json.dumps(obj))

if __name__=='__main__':
    run()