diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-08-09 16:51:35 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2019-08-09 16:51:35 -0700 |
commit | 9e4657d49dd91f1249042865505d1a9ea8ad2ea6 (patch) | |
tree | caaeae007db5ce630844e20b356d867c9783ebf4 /postgresql/petabox_transform.py | |
parent | 9944c674e9ded47431d76d06e60a65eebd510980 (diff) | |
download | sandcrawler-9e4657d49dd91f1249042865505d1a9ea8ad2ea6.tar.gz sandcrawler-9e4657d49dd91f1249042865505d1a9ea8ad2ea6.zip |
SQL backfill notes and python scripts
Diffstat (limited to 'postgresql/petabox_transform.py')
-rwxr-xr-x | postgresql/petabox_transform.py | 24 |
1 files changed, 24 insertions, 0 deletions
#!/usr/bin/env python3
"""Turn JSON records from stdin into tab-separated archive.org file rows.

Each non-blank input line is parsed as a JSON object that is expected to carry
a 'sha1' value and a 'urls' list of objects with a 'url' string. For every URL
of the form ``.../archive.org/(download|serve)/<item>/<path>`` one line is
printed to stdout: ``item<TAB>path<TAB>sha1``.
"""

import json
import os
import sys


def petabox_rows(record):
    """Yield ``(item, path, sha1hex)`` for each archive.org URL in *record*.

    Records with a missing or empty 'sha1' produce nothing, as do records
    with a missing or empty 'urls' list. URLs that do not contain
    ``//archive.org/`` and URLs hosted on ``web.archive.org`` are skipped.

    Raises:
        ValueError: if a URL contains ``//archive.org/`` but is not shaped
            like ``scheme://archive.org/(download|serve)/<item>[/<path>]``.
    """
    sha1hex = record.get('sha1')
    if not sha1hex:
        return
    for entry in record.get('urls') or ():
        url = entry['url']
        if '//archive.org/' not in url:
            continue
        parts = url.split('/')
        # A wayback URL can embed "//archive.org/" inside its path; the host
        # segment tells the two apart.
        if parts[2] == 'web.archive.org':
            continue
        # Explicit check instead of `assert`: asserts vanish under `python -O`
        # and a malformed URL would then be emitted as a bogus row.
        if (
            len(parts) < 5
            or parts[2] != 'archive.org'
            or parts[3] not in ('download', 'serve')
        ):
            raise ValueError("unexpected archive.org URL: {!r}".format(url))
        yield (parts[4], '/'.join(parts[5:]), sha1hex)


def main():
    """Read JSON lines from stdin and print one TSV row per matching URL."""
    # Iterate the stream lazily rather than loading every line up front.
    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue
        for row in petabox_rows(json.loads(line)):
            print("\t".join(row))


if __name__ == '__main__':
    main()