aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
Diffstat (limited to 'python')
-rw-r--r--python/refcat/tasks.py43
1 files changed, 43 insertions, 0 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index 4ae21fe..0db6f86 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -300,6 +300,27 @@ class WikipediaCitationsMinimalDataset(luigi.ExternalTask, Refcat):
Dataset contains parquet, but we want JSON here:
$ parquet-tools cat --json minimal_dataset.parquet > minimal_dataset.json
+
+ Contains (07/2021) around 29276667 rows.
+
+ Rough ID type distribution:
+
+ 2160819 ISBN
+ 1442176 DOI
+ 825970 PMID
+ 353425 ISSN
+ 279369 PMC
+ 185742 OCLC
+ 181375 BIBCODE
+ 110921 JSTOR
+ 47601 ARXIV
+ 15202 LCCN
+ 12878 MR
+ 8270 ASIN
+ 6293 OL
+ 3790 SSRN
+ 3013 ZBL
+
"""
def output(self):
return luigi.LocalTarget(path=os.path.join(settings.WIKIPEDIA_CITATIONS, "minimal_dataset.json"))
@@ -1385,3 +1406,25 @@ class UnmatchedResolveJournalNamesMapped(Refcat):
def output(self):
return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
+
+# Wikipedia-related tasks
+
+class WikipediaDOI(Refcat):
+ """
+ DOI keys from Wikipedia, sorted on field 2 (LC_ALL=C byte order) and written zstd-compressed to a .tsv.zst target.
+ """
+ def requires(self):
+ return WikipediaCitationsMinimalDataset()
+
+ def run(self):
+ output = shellout("""
+ skate-wikipedia-doi < {input} |
+ LC_ALL=C sort -T {tmpdir} -S 20% -k2,2 |
+ zstd -T0 -c > {output}
+ """,
+ tmpdir=self.tmpdir,
+ input=self.input().path)
+ luigi.LocalTarget(output).move(self.output().path)
+
+ def output(self):
+ return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)