about summary refs log tree commit diff stats
path: root/python
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2021-07-07 23:34:28 +0200
committer Martin Czygan <martin.czygan@gmail.com> 2021-07-07 23:34:28 +0200
commit 7a5fbfc41c8c71576e4788c7ba891979c6f5f1a8 (patch)
tree d72f36377a2362104deb71f342172812459b2347 /python
parent 9b089b324d48e6c5d02d7f70adb585cde263f1e4 (diff)
parent 9ea69942a54f1c2e13f058ba35279af3612add1b (diff)
download refcat-7a5fbfc41c8c71576e4788c7ba891979c6f5f1a8.tar.gz
refcat-7a5fbfc41c8c71576e4788c7ba891979c6f5f1a8.zip
fix merge conflict
Diffstat (limited to 'python')
-rw-r--r-- python/refcat/tasks.py 47
1 files changed, 45 insertions, 2 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index 48c4226..0db6f86 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -244,9 +244,9 @@ class Refcat(BaseTask):
A base tasks for all refcat related tasks.
"""
BASE = settings.BASE
- TAG = '2021-07-01'
+ TAG = '2021-07-06'
- date = luigi.DateParameter(default=datetime.date(2021, 7, 1),
+ date = luigi.DateParameter(default=datetime.date(2021, 7, 6),
description="a versioning help, will be part of filename, change this manually")
tmpdir = luigi.Parameter(default=settings.TMPDIR, description="set tempdir", significant=False)
n = luigi.IntParameter(default=multiprocessing.cpu_count(), significant=False)
@@ -300,6 +300,27 @@ class WikipediaCitationsMinimalDataset(luigi.ExternalTask, Refcat):
Dataset contains parquet, but we want JSON here:
$ parquet-tools cat --json minimal_dataset.parquet > minimal_dataset.json
+
+ Contains (07/2021) around 29276667 rows.
+
+ Rough id type distribution:
+
+ 2160819 ISBN
+ 1442176 DOI
+ 825970 PMID
+ 353425 ISSN
+ 279369 PMC
+ 185742 OCLC
+ 181375 BIBCODE
+ 110921 JSTOR
+ 47601 ARXIV
+ 15202 LCCN
+ 12878 MR
+ 8270 ASIN
+ 6293 OL
+ 3790 SSRN
+ 3013 ZBL
+
"""
def output(self):
return luigi.LocalTarget(path=os.path.join(settings.WIKIPEDIA_CITATIONS, "minimal_dataset.json"))
@@ -1385,3 +1406,25 @@ class UnmatchedResolveJournalNamesMapped(Refcat):
def output(self):
return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
+
+# Wikipedia related tasks
+
+class WikipediaDOI(Refcat):
+ """
+ Sorted DOI keys from wikipedia.
+ """
+ def requires(self):
+ return WikipediaCitationsMinimalDataset()
+
+ def run(self):
+ output = shellout("""
+ skate-wikipedia-doi < {input} |
+ LC_ALL=C sort -T {tmpdir} -S 20% -k2,2 |
+ zstd -T0 -c > {output}
+ """,
+ tmpdir=self.tmpdir,
+ input=self.input().path)
+ luigi.LocalTarget(output).move(self.output().path)
+
+ def output(self):
+ return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)