aboutsummaryrefslogtreecommitdiffstats
path: root/postgrest/backfill/petabox_transform.py
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2019-09-23 23:00:23 -0700
committerBryan Newbold <bnewbold@archive.org>2019-09-23 23:00:23 -0700
commitb362abd38ad4a6624bc056c58eb90ae235c63f00 (patch)
tree026556fe548e28be1556c24b6ee865cb276755ca /postgrest/backfill/petabox_transform.py
parentb438f52dbb7578c9a5c2153bc4ba50e33fdae7c3 (diff)
downloadsandcrawler-b362abd38ad4a6624bc056c58eb90ae235c63f00.tar.gz
sandcrawler-b362abd38ad4a6624bc056c58eb90ae235c63f00.zip
rename postgrest directory sql
Diffstat (limited to 'postgrest/backfill/petabox_transform.py')
-rwxr-xr-xpostgrest/backfill/petabox_transform.py24
1 files changed, 0 insertions, 24 deletions
diff --git a/postgrest/backfill/petabox_transform.py b/postgrest/backfill/petabox_transform.py
deleted file mode 100755
index b638911..0000000
--- a/postgrest/backfill/petabox_transform.py
+++ /dev/null
@@ -1,24 +0,0 @@
#!/usr/bin/env python3
"""Turn JSON lines on stdin into petabox TSV rows on stdout.

Every non-blank input line is parsed as one JSON object. For each entry of
its ``urls`` list whose ``url`` has the form
``<scheme>://archive.org/download/<item>/<path>`` or
``<scheme>://archive.org/serve/<item>/<path>``, one row is printed:

    item<TAB>path<TAB>sha1

Records with a missing/empty ``sha1`` are skipped, as are URLs that do not
contain ``//archive.org/`` and wayback URLs (host ``web.archive.org``).

NOTE(review): the input schema (``sha1`` string, ``urls`` list of objects
with a ``url`` key) is inferred from the key accesses below -- confirm
against whatever produces the dump.
"""

import json
import os  # not used below; kept because the original import line had it
import sys

# Path prefixes under archive.org that address a file inside an item.
_ITEM_PREFIXES = ('download', 'serve')


def petabox_rows(record):
    """Yield ``(item, path, sha1hex)`` for each petabox URL in *record*.

    Args:
        record: a decoded JSON object (dict). ``record['sha1']`` is the
            hash echoed into every row; ``record['urls']`` is an iterable
            of dicts each carrying a ``'url'`` string.

    Yields:
        Tuples of three strings: item identifier, path inside the item
        (may be empty, may contain ``/``), and the record's sha1.

    Raises:
        ValueError: if a URL contains ``//archive.org/`` but is neither a
            wayback URL nor a ``download``/``serve`` URL with an item
            segment. This used to be an ``assert``, which is silently
            dropped under ``python -O``.
    """
    sha1hex = record.get('sha1')
    if not sha1hex:
        return
    for entry in record.get('urls') or ():
        raw_url = entry['url']
        if '//archive.org/' not in raw_url:
            continue
        # "https://archive.org/download/item/a/b" splits into
        # ['https:', '', 'archive.org', 'download', 'item', 'a', 'b'].
        parts = raw_url.split('/')
        if parts[2] == 'web.archive.org':
            # A wayback capture of an archive.org URL: the substring test
            # above matches it, but it is not a petabox file location.
            continue
        well_formed = (
            len(parts) >= 5
            and parts[2] == 'archive.org'
            and parts[3] in _ITEM_PREFIXES
        )
        if not well_formed:
            raise ValueError(
                'unexpected archive.org URL shape: {!r}'.format(raw_url))
        yield parts[4], '/'.join(parts[5:]), sha1hex


def transform_lines(lines):
    """Yield one tab-joined output row per petabox URL found in *lines*.

    Args:
        lines: an iterable of text lines, each blank or a JSON object.

    Yields:
        Strings of the form ``item\\tpath\\tsha1`` (no trailing newline).

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        ValueError: propagated from :func:`petabox_rows`.
    """
    for raw_line in lines:
        text = raw_line.strip()
        if not text:
            continue
        for row in petabox_rows(json.loads(text)):
            yield "\t".join(row)


def main():
    """Stream stdin through :func:`transform_lines` and print each row."""
    # Iterate the stream lazily instead of readlines() so large dumps are
    # not held in memory all at once.
    for row in transform_lines(sys.stdin):
        print(row)


if __name__ == '__main__':
    main()