diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-07-07 23:34:28 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-07-07 23:34:28 +0200 |
commit | 7a5fbfc41c8c71576e4788c7ba891979c6f5f1a8 (patch) | |
tree | d72f36377a2362104deb71f342172812459b2347 /python | |
parent | 9b089b324d48e6c5d02d7f70adb585cde263f1e4 (diff) | |
parent | 9ea69942a54f1c2e13f058ba35279af3612add1b (diff) | |
download | refcat-7a5fbfc41c8c71576e4788c7ba891979c6f5f1a8.tar.gz refcat-7a5fbfc41c8c71576e4788c7ba891979c6f5f1a8.zip |
fix merge conflict
Diffstat (limited to 'python')
-rw-r--r-- | python/refcat/tasks.py | 47 |
1 files changed, 45 insertions, 2 deletions
```diff
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index 48c4226..0db6f86 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -244,9 +244,9 @@ class Refcat(BaseTask):
     A base tasks for all refcat related tasks.
     """
     BASE = settings.BASE
-    TAG = '2021-07-01'
+    TAG = '2021-07-06'
 
-    date = luigi.DateParameter(default=datetime.date(2021, 7, 1),
+    date = luigi.DateParameter(default=datetime.date(2021, 7, 6),
                                description="a versioning help, will be part of filename, change this manually")
     tmpdir = luigi.Parameter(default=settings.TMPDIR, description="set tempdir", significant=False)
     n = luigi.IntParameter(default=multiprocessing.cpu_count(), significant=False)
@@ -300,6 +300,27 @@ class WikipediaCitationsMinimalDataset(luigi.ExternalTask, Refcat):
     Dataset contains parquet, but we want JSON here:
 
         $ parquet-tools cat --json minimal_dataset.parquet > minimal_dataset.json
+
+    Contains (07/2021) around 29276667 rows.
+
+    Rough id type distribution:
+
+        2160819 ISBN
+        1442176 DOI
+         825970 PMID
+         353425 ISSN
+         279369 PMC
+         185742 OCLC
+         181375 BIBCODE
+         110921 JSTOR
+          47601 ARXIV
+          15202 LCCN
+          12878 MR
+           8270 ASIN
+           6293 OL
+           3790 SSRN
+           3013 ZBL
+
     """
     def output(self):
         return luigi.LocalTarget(path=os.path.join(settings.WIKIPEDIA_CITATIONS, "minimal_dataset.json"))
@@ -1385,3 +1406,25 @@ class UnmatchedResolveJournalNamesMapped(Refcat):
 
     def output(self):
         return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
+
+# Wikipedia related tasks
+
+class WikipediaDOI(Refcat):
+    """
+    Sorted DOI keys from wikipedia.
+    """
+    def requires(self):
+        return WikipediaCitationsMinimalDataset()
+
+    def run(self):
+        output = shellout("""
+                          skate-wikipedia-doi < {input} |
+                          LC_ALL=C sort -T {tmpdir} -S 20% -k2,2 |
+                          zstd -T0 -c > {output}
+                          """,
+                          tmpdir=self.tmpdir,
+                          input=self.input().path)
+        luigi.LocalTarget(output).move(self.output().path)
+
+    def output(self):
+        return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
```