blob: 54c3d5feea52ca0e3e5b859561d28e4df7145d1d (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
|
#!/usr/bin/env python3
"""
Takes a "joined" TSV input stream:
- sha1
- dois (JSON list)
- cdx (JSON object)
- url
- dt
(etc)
- mimetype
- size (integer)
And outputs JSON objects that can be imported into fatcat with the
"matched" script.
No dependencies (only python3 stdlib)
"""
import base64
import json
import sys
def _convert_row(fields):
    """Build one output dict from an already-split, five-column TSV row.

    ``fields`` holds, in order: the base32 SHA-1 (optionally prefixed with
    ``sha1:``), the DOIs as JSON, the CDX record as JSON (must contain
    ``url`` and ``dt`` keys), the mimetype, and the size as a decimal string.

    Raises whatever the underlying parsers raise on malformed values
    (``json.JSONDecodeError``, ``KeyError``, ``ValueError``,
    ``binascii.Error``).
    """
    raw_sha1, dois_json, cdx_json, mimetype, size_text = fields
    dois = json.loads(dois_json)
    cdx = json.loads(cdx_json)
    size = int(size_text)
    # Re-encode the digest: base32 in, lowercase hexadecimal out.
    digest = base64.b32decode(raw_sha1.replace('sha1:', ''))
    sha1 = base64.b16encode(digest).decode('ascii').lower()
    return dict(
        sha1=sha1,
        dois=dois,
        # Only the URL and timestamp of the CDX record are kept, wrapped in
        # a single-element list.
        cdx=[dict(url=cdx['url'], dt=cdx['dt'])],
        size=size,
        mimetype=mimetype,
    )


def run():
    """Convert TSV rows read from stdin into JSON objects, one per line, on stdout.

    Each input line must contain exactly five tab-separated columns (see
    ``_convert_row``).

    Raises:
        ValueError: if a line does not have exactly five columns. An explicit
            exception is used instead of ``assert`` so the check still runs
            under ``python -O``.
    """
    for lineno, line in enumerate(sys.stdin, start=1):
        # Drop the line terminator so it never leaks into the last column.
        fields = line.rstrip('\n').split('\t')
        if len(fields) != 5:
            raise ValueError(
                'input line {}: expected 5 tab-separated fields, got {}'.format(
                    lineno, len(fields)))
        print(json.dumps(_convert_row(fields)))
# Script entry point: convert stdin to stdout only when executed directly,
# not when this module is imported.
if "__main__" == __name__:
    run()
|