aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2018-09-14 19:18:14 -0700
committerBryan Newbold <bnewbold@archive.org>2018-09-14 19:18:14 -0700
commit710a0feab36f83eef21885ee7c23e5841cae1e87 (patch)
tree945d28de7c3c66ec9a3181fe9ce3389f1f12ff89
parent8e67baf622daa21ceca1b7cbf13f5461d9d8029a (diff)
downloadsandcrawler-710a0feab36f83eef21885ee7c23e5841cae1e87.tar.gz
sandcrawler-710a0feab36f83eef21885ee7c23e5841cae1e87.zip
match and enrich notes+script
-rw-r--r--notes/match_filter_enrich.txt19
-rwxr-xr-xpython/enrich_scored_matches.py45
2 files changed, 64 insertions, 0 deletions
diff --git a/notes/match_filter_enrich.txt b/notes/match_filter_enrich.txt
new file mode 100644
index 0000000..e555d46
--- /dev/null
+++ b/notes/match_filter_enrich.txt
@@ -0,0 +1,19 @@
+
+This could all be a single scalding job eventually.
+
+First, run matchcrossref and dumpfilemeta, and copy the output down to an SSD
+somewhere.
+
+ bnewbold@ia601101$ zcat 2018-09-14-0559.05-dumpfilemeta.tsv.gz | wc -l
+ 30728100
+
+Reduce down the scored matches to just {sha1, dois}, sorted:
+
+ zcat 2018-08-27-2352.17-matchcrossref.tsv.gz | ./filter_scored_matches.py | pv -l | sort > 2018-08-27-2352.17-matchcrossref.filtered.tsv
+ # 5.79M 0:18:54 [5.11k/s]
+
+Join/merge the output:
+
+ zcat 2018-09-14-0559.05-dumpfilemeta.tsv.gz | LC_ALL=C join -t$'\t' 2018-08-27-2352.17-matchcrossref.filtered.tsv - | pv -l | enrich_scored_matches.py | gzip > 2018-08-27-2352.17-matchcrossref.insertable.json.gz
+ # 5.79M 0:09:09 [10.5k/s]
+
diff --git a/python/enrich_scored_matches.py b/python/enrich_scored_matches.py
new file mode 100755
index 0000000..9fe1499
--- /dev/null
+++ b/python/enrich_scored_matches.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python3
+"""
+Takes a "joined" TSV input stream:
+
+- sha1
+- dois (JSON list)
+- cdx (JSON object)
+ - url
+ - dt
+ (etc)
+- mimetype
+- size (integer)
+
+And outputs JSON objects that can be imported into fatcat with the
+"matched" script.
+
+No dependencies (only python3 stdlib)
+"""
+
+import sys
+import json
+import base64
+
+def run():
+    """
+    Convert joined TSV rows read from stdin into JSON lines on stdout.
+
+    Each input row must have exactly five tab-separated columns: sha1
+    (base32, optionally prefixed with "sha1:"), dois (JSON list), cdx
+    (JSON object with at least "url" and "dt" keys), mimetype, and size
+    (integer). One JSON object is printed per row, with the sha1
+    re-encoded as lowercase hex and the cdx object reduced to a
+    single-element list holding only its url and dt.
+
+    Raises ValueError if a row does not have exactly five columns.
+    """
+    for lineno, line in enumerate(sys.stdin, start=1):
+        # Drop the line terminator so it cannot leak into the last column.
+        fields = line.rstrip('\n').split('\t')
+        if len(fields) != 5:
+            # Explicit check rather than `assert`: asserts are stripped by
+            # `python -O` and say nothing about which row was malformed.
+            raise ValueError(
+                "line {}: expected 5 tab-separated columns, got {}".format(
+                    lineno, len(fields)))
+        raw_sha1 = fields[0].replace('sha1:', '')
+        dois = json.loads(fields[1])
+        cdx = json.loads(fields[2])
+        mimetype = fields[3]
+        size = int(fields[4])
+
+        # Re-encode the base32 digest as lowercase hex.
+        sha1 = base64.b16encode(base64.b32decode(raw_sha1)).decode('ascii').lower()
+
+        obj = dict(
+            sha1=sha1,
+            dois=dois,
+            # Only url and dt are carried over from the cdx object.
+            cdx=[dict(url=cdx['url'], dt=cdx['dt'])],
+            size=size,
+            mimetype=mimetype)
+        print(json.dumps(obj))
+
+# Process stdin only when executed as a script, not when imported.
+if __name__=='__main__':
+ run()