about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- python/refcat/tasks.py 51
1 files changed, 0 insertions, 51 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index 3358f41..430181a 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -1097,57 +1097,6 @@ class RefsFatcatClusters(Refcat):
return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
-class RefsFatcatClusterVerify(Refcat):
- """
- Verification; took 50h+, ran manually, documenting here (not running,
- fuzzycat needs pipenv currently).
-
- Output like:
-
- https://fatcat.wiki/release/a6xucdggk5h7bcmbxidvqt7hxe https://fatcat.wiki/release/amnpvj5ma5dxlc2a3x2bm2zbnq Status.STRONG Reason.SLUG_TITLE_AUTHOR_MATCH
- https://fatcat.wiki/release/vyppsuuh2bhapdwcqzln5momta https://fatcat.wiki/release/6gd53yl5yzakrlr72xeojamchi Status.DIFFERENT Reason.CONTRIB_INTERSECTION_EMPTY
- https://fatcat.wiki/release/hazousx6wna5bn5e27s5mrljzq https://fatcat.wiki/release/iajt2xam5nbc3ichkxxuhqaqw4 Status.DIFFERENT Reason.YEAR
-
- XXX: can we get rid of this?
- """
- def requires(self):
- return RefsFatcatClusters()
-
- def run(self):
- # As ran in 2021-02-12 on aitio
- #
- # $ time zstdcat -T0
- # sha1-ef1756a5856085807742966f48d95b4cb00299a0.json.zst | parallel
- # --tmpdir /bigger/tmp --blocksize 4M --pipe -j 16 'python -m fuzzycat
- # verify_ref' > cluster_ref_verify.tsv
- print(""" zstdcat -T0 {input} | parallel --tmpdir {tmpdir}
- --blocksize 4M --pipe -j 16 'python -m fuzzycat verify_ref' |
- zstd -c -T0 > {output} """.format(input=self.input().path, tmpdir=self.tmpdir, output=self.output().path))
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
-
-
-class BiblioRefFuzzy(Refcat):
- """
- A biblioref document from fuzzy match results. 12min.
- """
- def requires(self):
- return RefsFatcatClusterVerify()
-
- def run(self):
- output = shellout("""
- zstdcat -T0 {input} |
- LC_ALL=C grep -E "(EXACT|STRONG)" |
- LC_ALL=C grep -E -v "(Reason.DOI|Reason.WORK_ID)" |
- skate-biblioref |
- zstd -T0 -c9 > {output}
- """,
- input=self.input().path)
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
class BiblioRefFromJoin(Refcat):