diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-09-23 23:00:23 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2019-09-23 23:00:23 -0700 |
commit | b362abd38ad4a6624bc056c58eb90ae235c63f00 (patch) | |
tree | 026556fe548e28be1556c24b6ee865cb276755ca /postgrest/backfill/petabox_transform.py | |
parent | b438f52dbb7578c9a5c2153bc4ba50e33fdae7c3 (diff) | |
download | sandcrawler-b362abd38ad4a6624bc056c58eb90ae235c63f00.tar.gz sandcrawler-b362abd38ad4a6624bc056c58eb90ae235c63f00.zip |
rename postgrest directory sql
Diffstat (limited to 'postgrest/backfill/petabox_transform.py')
-rwxr-xr-x | postgrest/backfill/petabox_transform.py | 24 |
1 files changed, 0 insertions, 24 deletions
diff --git a/postgrest/backfill/petabox_transform.py b/postgrest/backfill/petabox_transform.py deleted file mode 100755 index b638911..0000000 --- a/postgrest/backfill/petabox_transform.py +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env python3 - -import json, sys, os - -for l in sys.stdin.readlines(): - l = l.strip() - if not l: - continue - r = json.loads(l) - if not r['sha1']: - continue - sha1hex = r['sha1'] - for url in r['urls']: - u = url['url'] - if not '//archive.org/' in u: - continue - u = u.split('/') - if u[2] == 'web.archive.org': - continue - #print(u) - assert u[2] == 'archive.org' and u[3] in ('download', 'serve') - item = u[4] - path = '/'.join(u[5:]) - print("\t".join([item, path, sha1hex])) |