From 710a0feab36f83eef21885ee7c23e5841cae1e87 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 14 Sep 2018 19:18:14 -0700 Subject: match and enrich notes+script --- notes/match_filter_enrich.txt | 19 +++++++++++++++++ python/enrich_scored_matches.py | 45 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) create mode 100644 notes/match_filter_enrich.txt create mode 100755 python/enrich_scored_matches.py diff --git a/notes/match_filter_enrich.txt b/notes/match_filter_enrich.txt new file mode 100644 index 0000000..e555d46 --- /dev/null +++ b/notes/match_filter_enrich.txt @@ -0,0 +1,19 @@ + +This could all be a single scalding job eventually. + +First, run matchcrossref and dumpfilemeta, and copy the output down to an SSD +somewhere. + + bnewbold@ia601101$ zcat 2018-09-14-0559.05-dumpfilemeta.tsv.gz | wc -l + 30728100 + +Reduce down the scored matches to just {sha1, dois}, sorted: + + zcat 2018-08-27-2352.17-matchcrossref.tsv.gz | ./filter_scored_matches.py | pv -l | sort > 2018-08-27-2352.17-matchcrossref.filtered.tsv + # 5.79M 0:18:54 [5.11k/s] + +Join/merge the output: + + zcat 2018-09-14-0559.05-dumpfilemeta.tsv.gz | LC_ALL=C join -t$'\t' 2018-08-27-2352.17-matchcrossref.filtered.tsv - | pv -l | enrich_scored_matches.py | gzip > 2018-08-27-2352.17-matchcrossref.insertable.json.gz + # 5.79M 0:09:09 [10.5k/s] + diff --git a/python/enrich_scored_matches.py b/python/enrich_scored_matches.py new file mode 100755 index 0000000..9fe1499 --- /dev/null +++ b/python/enrich_scored_matches.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python3 +""" +Takes a "joined" TSV input stream: + +- sha1 +- dois (JSON list) +- cdx (JSON object) + - url + - dt + (etc) +- mimetype +- size (integer) + +And outputs JSON objects that can be imported into fatcat with the +"matched" script. 
# No dependencies (only python3 stdlib)

import sys
import json
import base64

# Every joined row carries exactly these tab-separated columns, in order:
# sha1, dois, cdx, mimetype, size.
_EXPECTED_FIELDS = 5


def _enrich_row(line, lineno):
    """Convert one joined TSV row into the dict that is emitted as JSON.

    Args:
        line: one raw input line; a trailing newline is tolerated.
        lineno: 1-based position of the line in the input, used only to
            make the error message actionable.

    Returns:
        A dict with the keys ``sha1`` (lowercase hex), ``dois``, ``cdx``
        (a one-element list holding only the ``url`` and ``dt`` of the
        input cdx object), ``size`` (int) and ``mimetype``.

    Raises:
        ValueError: if the row does not have exactly five tab-separated
            fields. Errors from decoding the individual fields (JSON,
            base32, integer parsing, missing cdx keys) propagate unchanged.
    """
    fields = line.split('\t')
    # Explicit check rather than `assert`: assertions are removed when
    # Python runs with -O, which would let malformed rows through.
    if len(fields) != _EXPECTED_FIELDS:
        raise ValueError(
            'line {}: expected {} tab-separated fields, got {}'.format(
                lineno, _EXPECTED_FIELDS, len(fields)))

    raw_sha1 = fields[0].replace('sha1:', '')
    dois = json.loads(fields[1])
    cdx = json.loads(fields[2])
    mimetype = fields[3]
    # The last field still carries the line's newline; int() ignores
    # surrounding whitespace, so no explicit strip is needed.
    size = int(fields[4])

    # Input hashes are base32; the output uses lowercase hex.
    sha1 = base64.b16encode(base64.b32decode(raw_sha1)).decode('ascii').lower()

    return dict(
        sha1=sha1,
        dois=dois,
        # Only url and dt are kept from the cdx object; other keys are dropped.
        cdx=[dict(url=cdx['url'], dt=cdx['dt'])],
        size=size,
        mimetype=mimetype)


def run():
    """Read joined TSV rows from stdin and print one JSON object per row.

    Raises:
        ValueError: on the first row that does not have exactly five
            tab-separated fields; the message names the offending line.
    """
    for lineno, line in enumerate(sys.stdin, 1):
        print(json.dumps(_enrich_row(line, lineno)))


if __name__ == '__main__':
    run()