From 024fd2432aacef1f2a0be634575de4d2355fcd9c Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Tue, 6 Jul 2021 23:45:48 +0200 Subject: run a parity derivation --- python/refcat/tasks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'python') diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py index 01f879b..ab682a0 100644 --- a/python/refcat/tasks.py +++ b/python/refcat/tasks.py @@ -244,9 +244,9 @@ class Refcat(BaseTask): A base tasks for all refcat related tasks. """ BASE = settings.BASE - TAG = '2021-07-01' + TAG = '2021-07-06' - date = luigi.DateParameter(default=datetime.date(2021, 7, 1), + date = luigi.DateParameter(default=datetime.date(2021, 7, 6), description="a versioning help, will be part of filename, change this manually") tmpdir = luigi.Parameter(default=settings.TMPDIR, description="set tempdir", significant=False) n = luigi.IntParameter(default=multiprocessing.cpu_count(), significant=False) -- cgit v1.2.3 From d2e8720b512d2bf4a22ab7c15b71b7eda10024c6 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Tue, 6 Jul 2021 23:51:33 +0200 Subject: do not compress sort tmp files --- python/refcat/tasks.py | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'python') diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py index ab682a0..4ae21fe 100644 --- a/python/refcat/tasks.py +++ b/python/refcat/tasks.py @@ -457,7 +457,7 @@ class URLTabs(Refcat): output = shellout(""" zstdcat -T0 {input} | skate-map -m ru -skip-on-empty 3 | - LC_ALL=C sort -T {tmpdir} -k3,3 -S25% --compress-program=zstd | + LC_ALL=C sort -T {tmpdir} -k3,3 -S25% | zstd -T0 -c > {output} """, n=self.n, @@ -480,7 +480,7 @@ class URLTabsCleaned(Refcat): output = shellout(""" zstdcat -T0 {input} | skate-cleanup -c url -allow http,https -X -B -S -f 3 | - LC_ALL=C sort -T {tmpdir} -k3,3 -S25% --compress-program=zstd | + LC_ALL=C sort -T {tmpdir} -k3,3 -S25% | zstd -T0 -c > {output} """, n=self.n, @@ -504,7 +504,7 @@ class URLList(Refcat): zstdcat -T0 {input} | cut -f 3 | skate-cleanup -X -c url -B -S -f 1 | - LC_ALL=C sort -u -T {tmpdir} -k1,1 -S25% --compress-program=zstd | + LC_ALL=C sort -u -T {tmpdir} -k1,1 -S25% | LC_ALL=C grep -E '^https?://' | zstd -T0 -c > {output} """, @@ -535,7 +535,7 @@ class RefsDOI(Refcat): zstdcat -T0 {input} | skate-map -m ff -x biblio.doi -skip-on-empty 1 | skate-cleanup -S -c doi -f 1 | - LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --compress-program=zstd | + LC_ALL=C sort -T {tmpdir} -k1,1 -S25% | zstd -T0 -c > {output} """, n=self.n, @@ -559,7 +559,7 @@ class RefsPMID(Refcat): output = shellout(""" zstdcat -T0 {input} | skate-map -m ff -x biblio.pmid -skip-on-empty 1 | - LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --compress-program=zstd | + LC_ALL=C sort -T {tmpdir} -k1,1 -S25% | zstd -T0 -c > {output} """, n=self.n, @@ -583,7 +583,7 @@ class RefsPMCID(Refcat): output = shellout(""" zstdcat -T0 {input} | skate-map -m ff -x biblio.pmcid -skip-on-empty 1 | - LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --compress-program=zstd | + LC_ALL=C sort -T {tmpdir} -k1,1 -S25% | zstd -T0 -c > {output} """, n=self.n, @@ -606,7 +606,7 @@ class RefsArxiv(Refcat): output = shellout(""" zstdcat -T0 {input} | skate-map -m ff -x biblio.arxiv_id -skip-on-empty 1 | - LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --compress-program=zstd | + LC_ALL=C sort -T {tmpdir} -k1,1 -S25% | zstd -T0 -c > {output} """, n=self.n, @@ -636,7 +636,7 @@ class FatcatDOI(Refcat): zstdcat -T0 {input} | skate-map -m ff -x ext_ids.doi -skip-on-empty 1 | skate-cleanup -S -c doi -f 1 | - LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --compress-program=zstd | + LC_ALL=C sort -T {tmpdir} -k1,1 -S25% | zstd -T0 -c > {output} """, n=self.n, @@ -659,7 +659,7 @@ class FatcatPMID(Refcat): output = shellout(""" zstdcat -T0 {input} | skate-map -m ff -x ext_ids.pmid -skip-on-empty 1 | - LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --compress-program=zstd | + LC_ALL=C sort -T {tmpdir} -k1,1 -S25% | zstd -T0 -c > {output} """, n=self.n, @@ -682,7 +682,7 @@ class FatcatPMCID(Refcat): output = shellout(""" zstdcat -T0 {input} | skate-map -m ff -x ext_ids.pmcid -skip-on-empty 1 | - LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --compress-program=zstd | + LC_ALL=C sort -T {tmpdir} -k1,1 -S25% | zstd -T0 -c > {output} """, n=self.n, @@ -705,7 +705,7 @@ class FatcatArxiv(Refcat): output = shellout(""" zstdcat -T0 {input} | skate-map -m ff -x extra.arxiv.base_id -skip-on-empty 1 | - LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --compress-program=zstd | + LC_ALL=C sort -T {tmpdir} -k1,1 -S25% | zstd -T0 -c > {output} """, n=self.n, @@ -736,7 +736,7 @@ class FatcatMapped(Refcat): output = shellout(""" zstdcat -T0 {input} | skate-map -m {mapper} -skip-on-empty 1 | - LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --compress-program=zstd | + LC_ALL=C sort -T {tmpdir} -k1,1 -S25% | zstd -T0 -c > {output} """, mapper=self.mapper, @@ -782,7 +782,7 @@ class RefsMapped(Refcat): output = shellout(""" zstdcat -T0 {input} | skate-map -m {mapper} -skip-on-empty 1 | - LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --compress-program=zstd | + LC_ALL=C sort -T {tmpdir} -k1,1 -S25% | zstd -T0 -c > {output} """, n=self.n, @@ -943,7 +943,7 @@ class OpenLibraryEditionsByWork(Refcat): zstdcat -T0 {input} | cut -f 5 | skate-map -skip-on-empty 1 -m ff -x 'works.0.key' | - LC_ALL=C sort -T {tmpdir} -S25% -k1,1 --compress-program=zstd | + LC_ALL=C sort -T {tmpdir} -S25% -k1,1 | zstd -T0 -c > {output} """, tmpdir=self.tmpdir, @@ -965,7 +965,7 @@ class OpenLibraryWorksSorted(Refcat): output = shellout(""" zstdcat -T0 {input} | cut -f 2,5 | - LC_ALL=C sort -T {tmpdir} -S25% -k1,1 --compress-program=zstd | + LC_ALL=C sort -T {tmpdir} -S25% -k1,1 | zstd -T0 -c > {output} """, tmpdir=self.tmpdir, @@ -1047,7 +1047,7 @@ class OpenLibraryEditionsMapped(Refcat): output = shellout(""" zstdcat -T0 {input} | skate-map -m {mapper} -skip-on-empty 1 | - LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --compress-program=zstd | + LC_ALL=C sort -T {tmpdir} -k1,1 -S25% | zstd -T0 -c > {output} """, n=self.n, @@ -1102,7 +1102,7 @@ class UnmatchedMapped(Refcat): zstdcat -T0 {input} | skate-conv -f ref | skate-map -m rcns -skip-on-empty 1 | - LC_ALL=C sort -T {tmpdir} -S25% -k1,1 --compress-program=zstd | + LC_ALL=C sort -T {tmpdir} -S25% -k1,1 | zstd -T0 -c > {output} """, tmpdir=self.tmpdir, @@ -1185,7 +1185,7 @@ class OpenLibraryReleaseMapped(Refcat): output = shellout(""" zstdcat -T0 {input} | skate-map -m {mapper} -skip-on-empty 1 | - LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --compress-program=zstd | + LC_ALL=C sort -T {tmpdir} -k1,1 -S25% | zstd -T0 -c > {output} """, mapper=self.mapper, @@ -1247,7 +1247,7 @@ class BrefSortedByWorkID(Refcat): output = shellout(""" zstdcat -T0 {bref} | skate-map -B -m ff -x source_work_ident | - LC_ALL=C sort -T {tmpdir} -S25% -k1,1 --compress-program=zstd | zstd -c -T0 > {output} + LC_ALL=C sort -T {tmpdir} -S25% -k1,1 | zstd -c -T0 > {output} """, tmpdir=self.tmpdir, bref=self.input().path) @@ -1271,7 +1271,7 @@ class RefsByWorkID(Refcat): output = shellout(""" zstdcat -T0 {input} | skate-map -m ff -x work_ident | - LC_ALL=C sort -T {tmpdir} -S25% -k1,1 --compress-program=zstd | + LC_ALL=C sort -T {tmpdir} -S25% -k1,1 | zstd -c -T0 > {output} """, tmpdir=self.tmpdir, @@ -1376,7 +1376,7 @@ class UnmatchedResolveJournalNamesMapped(Refcat): output = shellout(""" zstdcat -T0 {input} | skate-map -m vcns -skip-on-empty 1 | - LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --compress-program=zstd | + LC_ALL=C sort -T {tmpdir} -k1,1 -S25% | zstd -T0 -c > {output} """, tmpdir=self.tmpdir, -- cgit v1.2.3 From df8d801b0b7227d24e9508cfc2474859ee584d2a Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Wed, 7 Jul 2021 21:27:43 +0200 Subject: add WikipediaDOI --- python/refcat/tasks.py | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'python') diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py index 4ae21fe..0db6f86 100644 --- a/python/refcat/tasks.py +++ b/python/refcat/tasks.py @@ -300,6 +300,27 @@ class WikipediaCitationsMinimalDataset(luigi.ExternalTask, Refcat): Dataset contains parquet, but we want JSON here: $ parquet-tools cat --json minimal_dataset.parquet > minimal_dataset.json + + Contains (07/2021) around 29276667 rows. + + Rough id type distribution: + + 2160819 ISBN + 1442176 DOI + 825970 PMID + 353425 ISSN + 279369 PMC + 185742 OCLC + 181375 BIBCODE + 110921 JSTOR + 47601 ARXIV + 15202 LCCN + 12878 MR + 8270 ASIN + 6293 OL + 3790 SSRN + 3013 ZBL + """ def output(self): return luigi.LocalTarget(path=os.path.join(settings.WIKIPEDIA_CITATIONS, "minimal_dataset.json")) @@ -1385,3 +1406,25 @@ class UnmatchedResolveJournalNamesMapped(Refcat): def output(self): return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd) + +# Wikipedia related tasks + +class WikipediaDOI(Refcat): + """ + Sorted DOI keys from wikipedia. + """ + def requires(self): + return WikipediaCitationsMinimalDataset() + + def run(self): + output = shellout(""" + skate-wikipedia-doi < {input} | + LC_ALL=C sort -T {tmpdir} -S 20% -k2,2 | + zstd -T0 -c > {output} + """, + tmpdir=self.tmpdir, + input=self.input().path) + luigi.LocalTarget(output).move(self.output().path) + + def output(self): + return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd) -- cgit v1.2.3