about | summary | refs | log | tree | commit | diff | stats
path: root/postgresql/petabox_transform.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2019-08-09 16:51:35 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2019-08-09 16:51:35 -0700
commit: 9e4657d49dd91f1249042865505d1a9ea8ad2ea6 (patch)
tree: caaeae007db5ce630844e20b356d867c9783ebf4 /postgresql/petabox_transform.py
parent: 9944c674e9ded47431d76d06e60a65eebd510980 (diff)
download: sandcrawler-9e4657d49dd91f1249042865505d1a9ea8ad2ea6.tar.gz
download: sandcrawler-9e4657d49dd91f1249042865505d1a9ea8ad2ea6.zip
SQL backfill notes and python scripts
Diffstat (limited to 'postgresql/petabox_transform.py')
-rwxr-xr-x postgresql/petabox_transform.py 24
1 files changed, 24 insertions, 0 deletions
diff --git a/postgresql/petabox_transform.py b/postgresql/petabox_transform.py
new file mode 100755
index 0000000..b638911
--- /dev/null
+++ b/postgresql/petabox_transform.py
@@ -0,0 +1,24 @@
#!/usr/bin/env python3
"""
Turn a JSON-lines dump of file records into TSV rows of archive.org item files.

Input (stdin): one JSON object per line. Each object is read for a 'sha1'
value and a 'urls' list whose entries each carry a 'url' string.

Output (stdout): one line per matching URL, tab-separated:

    item <TAB> path <TAB> sha1

A URL produces a row only when it contains '//archive.org/' and has the shape
<scheme>://archive.org/download/<item>/<path> (or '/serve/' in place of
'/download/'). URLs hosted on web.archive.org are skipped even when they embed
an archive.org URL. Any other URL containing '//archive.org/' is treated as
unexpected input and aborts the run with ValueError.
"""

import json
import os
import sys

# Path prefixes on archive.org that are followed by "<item>/<path>".
ITEM_PREFIXES = ('download', 'serve')


def petabox_rows(record):
    """Yield an (item, path, sha1) tuple for each archive.org file URL.

    `record` is one decoded JSON object. Records whose 'sha1' is missing or
    falsy yield nothing, as do records with a missing or empty 'urls' list.

    Raises ValueError for a URL that contains '//archive.org/' but is not of
    the form <scheme>://archive.org/{download,serve}/<item>/...
    """
    sha1hex = record.get('sha1')
    if not sha1hex:
        return
    for entry in record.get('urls') or ():
        raw = entry['url']
        if '//archive.org/' not in raw:
            continue
        parts = raw.split('/')
        # For "scheme://host/..." the host is the third '/'-separated piece.
        # Wayback URLs can embed an archive.org URL further along; skip them.
        if parts[2] == 'web.archive.org':
            continue
        # Explicit check instead of `assert`, so the validation still runs
        # under `python -O` and a missing item segment is reported clearly.
        well_formed = (
            len(parts) >= 5
            and parts[2] == 'archive.org'
            and parts[3] in ITEM_PREFIXES
        )
        if not well_formed:
            raise ValueError(f'unexpected archive.org URL: {raw!r}')
        yield parts[4], '/'.join(parts[5:]), sha1hex


def transform_lines(lines):
    """Yield tab-separated output rows for an iterable of JSON text lines.

    Blank (whitespace-only) lines are ignored. Rows are yielded lazily so
    the caller can stream arbitrarily large inputs.
    """
    for line in lines:
        line = line.strip()
        if not line:
            continue
        for row in petabox_rows(json.loads(line)):
            yield "\t".join(row)


def main():
    """Stream stdin through transform_lines and print each row to stdout."""
    # Iterate the file object directly rather than readlines(), so the whole
    # dump is never held in memory at once.
    for row in transform_lines(sys.stdin):
        print(row)


if __name__ == '__main__':
    main()