aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2018-09-14 19:18:14 -0700
committerBryan Newbold <bnewbold@archive.org>2018-09-14 19:18:14 -0700
commit710a0feab36f83eef21885ee7c23e5841cae1e87 (patch)
tree945d28de7c3c66ec9a3181fe9ce3389f1f12ff89 /python
parent8e67baf622daa21ceca1b7cbf13f5461d9d8029a (diff)
downloadsandcrawler-710a0feab36f83eef21885ee7c23e5841cae1e87.tar.gz
sandcrawler-710a0feab36f83eef21885ee7c23e5841cae1e87.zip
match and enrich notes+script
Diffstat (limited to 'python')
-rwxr-xr-xpython/enrich_scored_matches.py45
1 files changed, 45 insertions, 0 deletions
diff --git a/python/enrich_scored_matches.py b/python/enrich_scored_matches.py
new file mode 100755
index 0000000..9fe1499
--- /dev/null
+++ b/python/enrich_scored_matches.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python3
+"""
+Takes a "joined" TSV input stream:
+
+- sha1
+- dois (JSON list)
+- cdx (JSON object)
+ - url
+ - dt
+ (etc)
+- mimetype
+- size (integer)
+
+And outputs JSON objects that can be imported into fatcat with the
+"matched" script.
+
+No dependencies (only python3 stdlib)
+"""
+
+import sys
+import json
+import base64
+
def _enrich_row(row):
    """Convert one tab-separated input row into an output record.

    ``row`` must hold exactly five tab-separated columns, in order: the
    base32 SHA-1 (optionally tagged with ``sha1:``), a JSON list of DOIs,
    a JSON CDX object, the mimetype, and the size as an integer.

    Returns a dict with keys ``sha1`` (lowercase hex), ``dois``, ``cdx``
    (a one-element list keeping only ``url`` and ``dt``), ``size`` and
    ``mimetype``.

    Raises ValueError when the column count is wrong or a column cannot be
    decoded (JSON, integer, and base32 failures are all ValueError
    subclasses), and KeyError when the CDX object lacks ``url`` or ``dt``.
    """
    fields = row.split('\t')
    if len(fields) != 5:
        # Explicit check rather than ``assert`` so that the validation is
        # not stripped when the interpreter runs with -O.
        raise ValueError(
            "expected 5 tab-separated columns, got {}: {!r}".format(
                len(fields), row))
    raw_sha1, dois_json, cdx_json, mimetype, size_text = fields

    dois = json.loads(dois_json)
    cdx = json.loads(cdx_json)
    size = int(size_text)

    # Re-encode the digest from base32 to lowercase hexadecimal.
    sha1 = base64.b32decode(raw_sha1.replace('sha1:', '')).hex()

    return dict(
        sha1=sha1,
        dois=dois,
        cdx=[dict(url=cdx['url'], dt=cdx['dt'])],
        size=size,
        mimetype=mimetype)


def run():
    """Read joined TSV rows from stdin and print one JSON object per row.

    Blank lines (for example a trailing newline at the end of the stream)
    are skipped. Any malformed row aborts processing with the exception
    raised by ``_enrich_row``.
    """
    for line in sys.stdin:
        # Drop the line terminator so it cannot leak into the last column.
        row = line.rstrip('\r\n')
        if not row:
            continue
        print(json.dumps(_enrich_row(row)))
+
# Only consume stdin when executed as a script, not when imported.
if __name__ == "__main__":
    run()