diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-09-25 18:01:29 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2019-09-25 18:01:29 -0700 |
commit | 716483103dd7fdfe7aab2982c51abae6d3f4271b (patch) | |
tree | 31f957048d63e32579d480983dbb922065796cce /python/enrich_scored_matches.py | |
parent | 55957b81de5e438f2a245177d624040e74e721b3 (diff) | |
download | sandcrawler-716483103dd7fdfe7aab2982c51abae6d3f4271b.tar.gz sandcrawler-716483103dd7fdfe7aab2982c51abae6d3f4271b.zip |
move a bunch of random old scripts to subdir
Diffstat (limited to 'python/enrich_scored_matches.py')
-rwxr-xr-x | python/enrich_scored_matches.py | 45 |
1 files changed, 0 insertions, 45 deletions
#!/usr/bin/env python3
# Recovered from: diff --git a/python/enrich_scored_matches.py
# b/python/enrich_scored_matches.py (deleted file mode 100755,
# index 9fe1499..0000000, hunk @@ -1,45 +0,0 @@).
"""
Takes a "joined" TSV input stream:

- sha1
- dois (JSON list)
- cdx (JSON object)
    - url
    - dt
    (etc)
- mimetype
- size (integer)

And outputs JSON objects that can be imported into fatcat with the
"matched" script.

No dependencies (only python3 stdlib)
"""

import sys
import json
import base64

# Number of tab-separated columns expected on every input row.
EXPECTED_COLUMNS = 5


def _enrich_line(line):
    """Convert one tab-separated input row into the output dict.

    The row must hold exactly five columns: sha1 (base32, optionally
    prefixed with ``sha1:``), dois (JSON), cdx (JSON object with at least
    ``url`` and ``dt`` keys), mimetype, and size (integer).

    Raises ValueError if the column count is wrong (or if the JSON, base32
    or integer fields fail to parse) and KeyError if the cdx object lacks
    ``url`` or ``dt``.
    """
    fields = line.rstrip('\n').split('\t')
    # Explicit check instead of `assert`, which disappears under `python -O`.
    if len(fields) != EXPECTED_COLUMNS:
        raise ValueError(
            "expected {} tab-separated columns, got {}".format(
                EXPECTED_COLUMNS, len(fields)))

    raw_sha1 = fields[0].replace('sha1:', '')
    dois = json.loads(fields[1])
    cdx = json.loads(fields[2])
    mimetype = fields[3]
    size = int(fields[4])

    # The input digest is base32; the output uses lowercase hex.
    sha1 = base64.b32decode(raw_sha1).hex()

    return dict(
        sha1=sha1,
        dois=dois,
        # Only the url and dt of the cdx record are carried over, wrapped
        # in a single-element list.
        cdx=[dict(url=cdx['url'], dt=cdx['dt'])],
        size=size,
        mimetype=mimetype)


def run(in_stream=None, out_stream=None):
    """Read joined TSV rows and write one JSON object per row.

    in_stream: iterable of text lines; defaults to sys.stdin.
    out_stream: writable text stream; defaults to sys.stdout.

    Whitespace-only lines (e.g. a trailing empty line) are skipped. Any
    other malformed row raises ValueError, annotated with its 1-based
    line number.
    """
    if in_stream is None:
        in_stream = sys.stdin
    if out_stream is None:
        out_stream = sys.stdout

    for lineno, line in enumerate(in_stream, start=1):
        if not line.strip():
            continue
        try:
            obj = _enrich_line(line)
        except ValueError as err:
            raise ValueError(
                "input line {}: {}".format(lineno, err)) from err
        print(json.dumps(obj), file=out_stream)


if __name__ == '__main__':
    run()