aboutsummaryrefslogtreecommitdiffstats
path: root/sql/backfill/petabox_transform.py
blob: b63891129ef1f77175e72b70946c14f199ef139f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
#!/usr/bin/env python3

import json, sys, os

def petabox_location(url):
    """Split an archive.org file URL into its ``(item, path)`` pair.

    Returns ``None`` when the URL should simply be skipped: it does not
    contain ``//archive.org/`` at all, or its host is ``web.archive.org``.

    Raises ``ValueError`` when the URL contains ``//archive.org/`` but is not
    of the form ``<scheme>://archive.org/{download,serve}/<item>[/<path>]``.
    This is an explicit raise rather than an ``assert`` so the check still
    runs under ``python -O``.
    """
    if '//archive.org/' not in url:
        return None
    # Expected layout after splitting on '/':
    #   [<scheme>:, '', <host>, <endpoint>, <item>, <path segment>, ...]
    parts = url.split('/')
    host = parts[2]
    if host == 'web.archive.org':
        # These URLs can embed an archive.org URL further along, which is
        # how they get past the substring test above; they are skipped.
        return None
    if (len(parts) < 5 or host != 'archive.org'
            or parts[3] not in ('download', 'serve')):
        raise ValueError('unexpected archive.org URL: {!r}'.format(url))
    return parts[4], '/'.join(parts[5:])


def transform_line(line):
    """Yield one ``item<TAB>path<TAB>sha1`` row per archive.org URL in *line*.

    *line* is a single line of the input: a JSON object with a ``sha1`` value
    and a ``urls`` list of objects that each carry a ``url`` string.  Blank
    lines and records whose ``sha1`` is missing or empty yield nothing.
    Rows are yielded lazily, so rows preceding a malformed URL are still
    produced before the ``ValueError`` propagates.
    """
    line = line.strip()
    if not line:
        return
    record = json.loads(line)
    # A missing key is treated like the null/empty sha1 that is skipped anyway.
    sha1hex = record.get('sha1')
    if not sha1hex:
        return
    for entry in record.get('urls') or ():
        location = petabox_location(entry['url'])
        if location is None:
            continue
        item, path = location
        yield '\t'.join([item, path, sha1hex])


def main():
    """Read JSON lines from stdin and print the TSV rows to stdout."""
    # Iterate the stream directly instead of readlines() so the whole dump
    # is never held in memory at once.
    for line in sys.stdin:
        for row in transform_line(line):
            print(row)


if __name__ == '__main__':
    main()