aboutsummaryrefslogtreecommitdiffstats
path: root/python/scripts/enrich_scored_matches.py
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2019-09-25 18:01:29 -0700
committerBryan Newbold <bnewbold@archive.org>2019-09-25 18:01:29 -0700
commit716483103dd7fdfe7aab2982c51abae6d3f4271b (patch)
tree31f957048d63e32579d480983dbb922065796cce /python/scripts/enrich_scored_matches.py
parent55957b81de5e438f2a245177d624040e74e721b3 (diff)
downloadsandcrawler-716483103dd7fdfe7aab2982c51abae6d3f4271b.tar.gz
sandcrawler-716483103dd7fdfe7aab2982c51abae6d3f4271b.zip
move a bunch of random old scripts to subdir
Diffstat (limited to 'python/scripts/enrich_scored_matches.py')
-rwxr-xr-xpython/scripts/enrich_scored_matches.py45
1 files changed, 45 insertions, 0 deletions
diff --git a/python/scripts/enrich_scored_matches.py b/python/scripts/enrich_scored_matches.py
new file mode 100755
index 0000000..9fe1499
--- /dev/null
+++ b/python/scripts/enrich_scored_matches.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python3
+"""
+Takes a "joined" TSV input stream:
+
+- sha1
+- dois (JSON list)
+- cdx (JSON object)
+ - url
+ - dt
+ (etc)
+- mimetype
+- size (integer)
+
+And outputs JSON objects that can be imported into fatcat with the
+"matched" script.
+
+No dependencies (only python3 stdlib)
+"""
+
+import sys
+import json
+import base64
+
+def run(input_stream=None, output_stream=None):
+    """
+    Convert "joined" TSV rows into JSON objects, one per output line.
+
+    Each non-empty input line must have exactly five tab-separated columns:
+    sha1 (base32, optionally prefixed with "sha1:"), dois (JSON), cdx (JSON
+    object with at least "url" and "dt" keys), mimetype, and size (integer).
+
+    Args:
+        input_stream: iterable of text lines; defaults to sys.stdin.
+        output_stream: writable text stream; defaults to sys.stdout.
+
+    Raises:
+        ValueError: if a line does not have exactly five columns, or if the
+            size, JSON, or base32 columns cannot be decoded.
+        KeyError: if the cdx object lacks "url" or "dt".
+    """
+    # resolve defaults at call time so redirected sys.stdin/stdout are honored
+    if input_stream is None:
+        input_stream = sys.stdin
+    if output_stream is None:
+        output_stream = sys.stdout
+
+    for lineno, line in enumerate(input_stream, start=1):
+        line = line.rstrip('\r\n')
+        if not line:
+            # tolerate blank lines (eg, a stray newline at the end of input)
+            continue
+        fields = line.split('\t')
+        # explicit check instead of assert, which is stripped by "python -O"
+        if len(fields) != 5:
+            raise ValueError(
+                "line {}: expected 5 tab-separated columns, got {}".format(
+                    lineno, len(fields)))
+        raw_sha1 = fields[0].replace('sha1:', '')
+        dois = json.loads(fields[1])
+        cdx = json.loads(fields[2])
+        mimetype = fields[3]
+        size = int(fields[4])
+
+        # input hash is base32; output hash is lowercase hex
+        sha1 = base64.b32decode(raw_sha1).hex()
+
+        obj = dict(
+            sha1=sha1,
+            dois=dois,
+            # only the "url" and "dt" keys of the CDX object are passed on
+            cdx=[dict(url=cdx['url'], dt=cdx['dt'])],
+            size=size,
+            mimetype=mimetype)
+        print(json.dumps(obj), file=output_stream)
+
+# Script entry point: process the input stream only when executed directly,
+# not when this module is imported.
+if __name__ == "__main__":
+    run()