diff options
Diffstat (limited to 'python')
-rw-r--r-- | python/refcat/tasks.py | 43 |
1 files changed, 43 insertions, 0 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index 4ae21fe..0db6f86 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -300,6 +300,27 @@ class WikipediaCitationsMinimalDataset(luigi.ExternalTask, Refcat):
     Dataset contains parquet, but we want JSON here:
 
     $ parquet-tools cat --json minimal_dataset.parquet > minimal_dataset.json
+
+    Contains (07/2021) around 29276667 rows.
+
+    Rough id type distribution:
+
+        2160819 ISBN
+        1442176 DOI
+        825970 PMID
+        353425 ISSN
+        279369 PMC
+        185742 OCLC
+        181375 BIBCODE
+        110921 JSTOR
+        47601 ARXIV
+        15202 LCCN
+        12878 MR
+        8270 ASIN
+        6293 OL
+        3790 SSRN
+        3013 ZBL
+
     """
     def output(self):
         return luigi.LocalTarget(path=os.path.join(settings.WIKIPEDIA_CITATIONS, "minimal_dataset.json"))
@@ -1385,3 +1406,25 @@ class UnmatchedResolveJournalNamesMapped(Refcat):
 
     def output(self):
         return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
+
+# Wikipedia related tasks
+
+class WikipediaDOI(Refcat):
+    """
+    Sorted DOI keys from wikipedia.
+    """
+    def requires(self):
+        return WikipediaCitationsMinimalDataset()
+
+    def run(self):
+        output = shellout("""
+                          skate-wikipedia-doi < {input} |
+                          LC_ALL=C sort -T {tmpdir} -S 20% -k2,2 |
+                          zstd -T0 -c > {output}
+                          """,
+                          tmpdir=self.tmpdir,
+                          input=self.input().path)
+        luigi.LocalTarget(output).move(self.output().path)
+
+    def output(self):
+        return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)