diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-08-05 01:35:34 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-08-05 01:35:34 +0200 |
commit | c8b2a4c63064c0096cabc6181974c3f0b46dcafb (patch) | |
tree | 37304e5b7061efabcdcd50c478c5ba12220751f9 /python | |
parent | e4690545cbef7a83b03010a8aa5a1bc8ecf5111d (diff) | |
download | refcat-c8b2a4c63064c0096cabc6181974c3f0b46dcafb.tar.gz refcat-c8b2a4c63064c0096cabc6181974c3f0b46dcafb.zip |
tasks: basic cleanup
e.g. filter out docs with too many dots:
https://qa.fatcat.wiki/release/gu2khq347rgonfttti7xy3vwiu/refs-out
Diffstat (limited to 'python')
-rw-r--r-- | python/refcat/tasks.py | 3 |
1 file changed, 2 insertions, 1 deletion
```diff
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index 43eb4b2..75a3daa 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -337,7 +337,7 @@ class OpenCitations(luigi.ExternalTask, Refcat):
 class RefsWithUnstructured(Refcat):
     """
     Augment refs with data from biblio.unstructured - do this first, so we can
-    use it in all subsequent steps.
+    use it in all subsequent steps. Do some basic cleanup.
     """
     def requires(self):
         return Refs()
@@ -345,6 +345,7 @@ class RefsWithUnstructured(Refcat):
     def run(self):
         output = shellout("""
                           zstdcat -T0 {input} |
+                          skate-cleanup -c ref |
                           skate-from-unstructured |
                           zstd -T0 -c > {output}
                           """,
```