about | summary | refs | log | tree | commit | diff | stats
path: root/python
diff options
context:
space:
mode:
author: Martin Czygan <martin.czygan@gmail.com> 2021-08-05 01:35:34 +0200
committer: Martin Czygan <martin.czygan@gmail.com> 2021-08-05 01:35:34 +0200
commit: c8b2a4c63064c0096cabc6181974c3f0b46dcafb (patch)
tree: 37304e5b7061efabcdcd50c478c5ba12220751f9 /python
parent: e4690545cbef7a83b03010a8aa5a1bc8ecf5111d (diff)
download: refcat-c8b2a4c63064c0096cabc6181974c3f0b46dcafb.tar.gz
          refcat-c8b2a4c63064c0096cabc6181974c3f0b46dcafb.zip
tasks: basic cleanup
e.g. filter out docs with too many dots: https://qa.fatcat.wiki/release/gu2khq347rgonfttti7xy3vwiu/refs-out
Diffstat (limited to 'python')
-rw-r--r-- python/refcat/tasks.py 3
1 files changed, 2 insertions, 1 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index 43eb4b2..75a3daa 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -337,7 +337,7 @@ class OpenCitations(luigi.ExternalTask, Refcat):
class RefsWithUnstructured(Refcat):
"""
Augment refs with data from biblio.unstructured - do this first, so we can
- use it in all subsequent steps.
+ use it in all subsequent steps. Do some basic cleanup.
"""
def requires(self):
return Refs()
@@ -345,6 +345,7 @@ class RefsWithUnstructured(Refcat):
def run(self):
output = shellout("""
zstdcat -T0 {input} |
+ skate-cleanup -c ref |
skate-from-unstructured |
zstd -T0 -c > {output}
""",