From c8b2a4c63064c0096cabc6181974c3f0b46dcafb Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Thu, 5 Aug 2021 01:35:34 +0200 Subject: tasks: basic cleanup e.g. filter out docs with too many dots: https://qa.fatcat.wiki/release/gu2khq347rgonfttti7xy3vwiu/refs-out --- python/refcat/tasks.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py index 43eb4b2..75a3daa 100644 --- a/python/refcat/tasks.py +++ b/python/refcat/tasks.py @@ -337,7 +337,7 @@ class OpenCitations(luigi.ExternalTask, Refcat): class RefsWithUnstructured(Refcat): """ Augment refs with data from biblio.unstructured - do this first, so we can - use it in all subsequent steps. + use it in all subsequent steps. Do some basic cleanup. """ def requires(self): return Refs() @@ -345,6 +345,7 @@ class RefsWithUnstructured(Refcat): def run(self): output = shellout(""" zstdcat -T0 {input} | + skate-cleanup -c ref | skate-from-unstructured | zstd -T0 -c > {output} """, -- cgit v1.2.3