aboutsummaryrefslogtreecommitdiffstats
path: root/python/enrich_scored_matches.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2019-09-25 18:01:29 -0700
committer Bryan Newbold <bnewbold@archive.org> 2019-09-25 18:01:29 -0700
commit 716483103dd7fdfe7aab2982c51abae6d3f4271b (patch)
tree 31f957048d63e32579d480983dbb922065796cce /python/enrich_scored_matches.py
parent 55957b81de5e438f2a245177d624040e74e721b3 (diff)
download sandcrawler-716483103dd7fdfe7aab2982c51abae6d3f4271b.tar.gz
sandcrawler-716483103dd7fdfe7aab2982c51abae6d3f4271b.zip
move a bunch of random old scripts to subdir
Diffstat (limited to 'python/enrich_scored_matches.py')
-rwxr-xr-x python/enrich_scored_matches.py 45
1 files changed, 0 insertions, 45 deletions
diff --git a/python/enrich_scored_matches.py b/python/enrich_scored_matches.py
deleted file mode 100755
index 9fe1499..0000000
--- a/python/enrich_scored_matches.py
+++ /dev/null
@@ -1,45 +0,0 @@
-#!/usr/bin/env python3
-"""
-Takes a "joined" TSV input stream:
-
-- sha1
-- dois (JSON list)
-- cdx (JSON object)
- - url
- - dt
- (etc)
-- mimetype
-- size (integer)
-
-And outputs JSON objects that can be imported into fatcat with the
-"matched" script.
-
-No dependencies (only python3 stdlib)
-"""
-
-import sys
-import json
-import base64
-
-def run():
- for line in sys.stdin:
- line = line.split('\t')
- assert len(line) == 5
- raw_sha1 = line[0].replace('sha1:', '')
- dois = json.loads(line[1])
- cdx = json.loads(line[2])
- mimetype = line[3]
- size = int(line[4])
-
- sha1 = base64.b16encode(base64.b32decode(raw_sha1)).decode('ascii').lower()
-
- obj = dict(
- sha1=sha1,
- dois=dois,
- cdx=[dict(url=cdx['url'], dt=cdx['dt'])],
- size=size,
- mimetype=mimetype)
- print(json.dumps(obj))
-
-if __name__=='__main__':
- run()