diff options
-rw-r--r-- | python/refcat/tasks.py | 58 |
1 files changed, 52 insertions, 6 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py index 8459608..5eaca50 100644 --- a/python/refcat/tasks.py +++ b/python/refcat/tasks.py @@ -177,6 +177,7 @@ class MAGPapers(luigi.ExternalTask, Refcat): def output(self): return luigi.LocalTarget(path=os.path.join(settings.MAG, "Papers.txt.gz"), format=Zstd) + class WikipediaCitationsMinimalDataset(luigi.ExternalTask, Refcat): """ From archive.org/details/wikipedia_citations_2020-07-14 (Wikipedia @@ -193,6 +194,7 @@ class WikipediaCitationsMinimalDataset(luigi.ExternalTask, Refcat): # ----8< Derivations + class RefsWithUnstructured(Refcat): """ Augment refs with data from unstructured. Do this first, so we can use it @@ -1097,8 +1099,6 @@ class RefsFatcatClusters(Refcat): return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd) - - # ==== new style zippy biblioref generation @@ -1228,7 +1228,8 @@ class BiblioRefV2(Refcat): skate-bref-id | zstd -T0 >> {output} """, - input=target.path, output=tmpf) + input=target.path, + output=tmpf) luigi.LocalTarget(tmpf).move(self.output().path) def output(self): @@ -1317,8 +1318,10 @@ class RGSitemapFatcatSortedKeys(Refcat): def output(self): return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd) + # ==== MAG + class MAGDOI(Refcat): """ List of MAG DOI. @@ -1339,13 +1342,14 @@ class MAGDOI(Refcat): def output(self): return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd) + # ==== WikipediaCitations + class BiblioRefWikiDOISortedKeys(Refcat): """ Sorted DOI keys from wikipedia. """ - def requires(self): return WikipediaCitationsMinimalDataset() @@ -1355,14 +1359,15 @@ class BiblioRefWikiDOISortedKeys(Refcat): skate-wikipedia-doi | LC_ALL=C sort -S 10% -k2,2 | zstd -T0 -c > {output} - """, input=self.input().path) + """, + input=self.input().path) luigi.LocalTarget(output).move(self.output().path) def output(self): return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd) -class BiblioRefWiki(Refcat): +class BiblioRefWiki(Refcat): def requires(self): return { "wiki": BiblioRefWikiDOISortedKeys(), @@ -1380,3 +1385,44 @@ class BiblioRefWiki(Refcat): def output(self): return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd) + + +# ==== Prepare unmatched + + +class BibliorefSortedIdent(Refcat): + def requires(self): + return BiblioRefV2() + + def run(self): + output = shellout(""" + zstdcat -T0 {input} | + skate-derive-key -b 50000 -verbose -F source_release_ident | + LC_ALL=C sort -k2,2 -S 35% --parallel 6 --compress-program pzstd -T {tmpdir} | + zstd -T0 -c > {output} + """, + tmpdir=self.tmpdir, + input=self.input().path) + luigi.LocalTarget(output).move(self.output().path) + + def output(self): + return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd) + + +class RefSortedIdent(Refcat): + def requires(self): + return RefsWithUnstructured() + + def run(self): + output = shellout(""" + zstdcat -T0 {input} | + skate-derive-key -b 50000 -verbose -F release_ident | + LC_ALL=C sort -k2,2 -S 35% --parallel 6 --compress-program pzstd -T {tmpdir} | + zstd -T0 -c > {output} + """, + tmpdir=self.tmpdir, + input=self.input().path) + luigi.LocalTarget(output).move(self.output().path) + + def output(self): + return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd) |