From 3dda415bd36653b7af3eac768ef28dbe8cfc49a4 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Wed, 31 Mar 2021 22:59:13 +0200 Subject: cleanup tasks --- python/refcat/tasks.py | 51 -------------------------------------------------- 1 file changed, 51 deletions(-) (limited to 'python') diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py index 3358f41..430181a 100644 --- a/python/refcat/tasks.py +++ b/python/refcat/tasks.py @@ -1097,57 +1097,6 @@ class RefsFatcatClusters(Refcat): return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd) -class RefsFatcatClusterVerify(Refcat): - """ - Verification; took 50h+, ran manually, documenting here (not running, - fuzzycat needs pipenv currently). - - Output like: - - https://fatcat.wiki/release/a6xucdggk5h7bcmbxidvqt7hxe https://fatcat.wiki/release/amnpvj5ma5dxlc2a3x2bm2zbnq Status.STRONG Reason.SLUG_TITLE_AUTHOR_MATCH - https://fatcat.wiki/release/vyppsuuh2bhapdwcqzln5momta https://fatcat.wiki/release/6gd53yl5yzakrlr72xeojamchi Status.DIFFERENT Reason.CONTRIB_INTERSECTION_EMPTY - https://fatcat.wiki/release/hazousx6wna5bn5e27s5mrljzq https://fatcat.wiki/release/iajt2xam5nbc3ichkxxuhqaqw4 Status.DIFFERENT Reason.YEAR - - XXX: can we get rid of this? - """ - def requires(self): - return RefsFatcatClusters() - - def run(self): - # As ran in 2021-02-12 on aitio - # - # $ time zstdcat -T0 - # sha1-ef1756a5856085807742966f48d95b4cb00299a0.json.zst | parallel - # --tmpdir /bigger/tmp --blocksize 4M --pipe -j 16 'python -m fuzzycat - # verify_ref' > cluster_ref_verify.tsv - print(""" zstdcat -T0 {input} | parallel --tmpdir {tmpdir} - --blocksize 4M --pipe -j 16 'python -m fuzzycat verify_ref' | - zstd -c -T0 > {output} """.format(input=self.input().path, tmpdir=self.tmpdir, output=self.output().path)) - - def output(self): - return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd) - - -class BiblioRefFuzzy(Refcat): - """ - A biblioref document from fuzzy match results. 12min. - """ - def requires(self): - return RefsFatcatClusterVerify() - - def run(self): - output = shellout(""" - zstdcat -T0 {input} | - LC_ALL=C grep -E "(EXACT|STRONG)" | - LC_ALL=C grep -E -v "(Reason.DOI|Reason.WORK_ID)" | - skate-biblioref | - zstd -T0 -c9 > {output} - """, - input=self.input().path) - luigi.LocalTarget(output).move(self.output().path) - - def output(self): - return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd) class BiblioRefFromJoin(Refcat): -- cgit v1.2.3