From 3cd6d9661bfc53df1f54f9bff484c3362ad49b02 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Sun, 21 Mar 2021 01:51:18 +0100 Subject: pipe refs through unstructured parser --- python/refcat/tasks.py | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) (limited to 'python') diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py index 5e590a1..c6faece 100644 --- a/python/refcat/tasks.py +++ b/python/refcat/tasks.py @@ -172,6 +172,26 @@ class ReleaseExportExpanded(luigi.ExternalTask, Refcat): # ----8< Derivations +class RefsWithUnstructured(Refcat): + """ + Augment refs with data from unstructured. Do this first, so we can use it + in all subsequent steps. + """ + def requires(self): + return Refs() + + def run(self): + output = shellout(""" + zstdcat -T0 {input} | + skate-from-unstructured | + zstd -T0 -c9 > {output} + """, + input=self.input().path) + luigi.LocalTarget(output).move(self.output().path) + + def output(self): + return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd) + class ReleaseExportReduced(Refcat): """ @@ -224,7 +244,7 @@ class URLTabs(Refcat): sys 85m54.710s """ def requires(self): - return Refs() + return RefsWithUnstructured() def run(self): output = shellout(""" @@ -272,7 +292,7 @@ class RefsDOI(Refcat): TSV with (ident, doi, full doc). """ def requires(self): - return Refs() + return RefsWithUnstructured() def run(self): """ @@ -305,7 +325,7 @@ class RefsPMID(Refcat): List of PMID, 74M refs seem to have one. """ def requires(self): - return Refs() + return RefsWithUnstructured() def run(self): output = shellout(r""" @@ -329,7 +349,7 @@ class RefsPMCID(Refcat): List of PMCID. """ def requires(self): - return Refs() + return RefsWithUnstructured() def run(self): output = shellout(r""" @@ -354,7 +374,7 @@ class RefsArxiv(Refcat): List of arxiv ids from refs. 
""" def requires(self): - return Refs() + return RefsWithUnstructured() def run(self): output = shellout(r""" @@ -381,7 +401,7 @@ class RefsTitles(Refcat): RNA!editing!in!isolated!Physarum!mitochondria.!RNA* """ def requires(self): - return Refs() + return RefsWithUnstructured() def run(self): output = shellout(r""" @@ -836,7 +856,7 @@ class RefsCounter(Refcat): Key counts, see: ref_counter.py. """ def requires(self): - return Refs() + return RefsWithUnstructured() def run(self): counts = collections.Counter() @@ -883,7 +903,7 @@ class RefsKeyStats(Refcat): How many titles, DOI, etc. do we have in refs? """ def requires(self): - return Refs() + return RefsWithUnstructured() def run(self): stats = { @@ -917,7 +937,7 @@ class RefsToRelease(Refcat): tools - XXX: polish. """ def requires(self): - return Refs() + return RefsWithUnstructured() def run(self): output = shellout(""" @@ -1290,7 +1310,7 @@ class UnmatchedRefs(Refcat): File with not yet considered refs (e.g. no title, doi, ...) """ def requires(self): - return Refs() + return RefsWithUnstructured() def run(self): output = shellout(""" -- cgit v1.2.3