From 024fd2432aacef1f2a0be634575de4d2355fcd9c Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Tue, 6 Jul 2021 23:45:48 +0200
Subject: run a parity derivation

---
 python/refcat/tasks.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'python')

diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index 01f879b..ab682a0 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -244,9 +244,9 @@ class Refcat(BaseTask):
     A base tasks for all refcat related tasks.
     """
     BASE = settings.BASE
-    TAG = '2021-07-01'
+    TAG = '2021-07-06'
 
-    date = luigi.DateParameter(default=datetime.date(2021, 7, 1),
+    date = luigi.DateParameter(default=datetime.date(2021, 7, 6),
                                description="a versioning help, will be part of filename, change this manually")
     tmpdir = luigi.Parameter(default=settings.TMPDIR, description="set tempdir", significant=False)
     n = luigi.IntParameter(default=multiprocessing.cpu_count(), significant=False)
-- 
cgit v1.2.3


From d2e8720b512d2bf4a22ab7c15b71b7eda10024c6 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Tue, 6 Jul 2021 23:51:33 +0200
Subject: do not compress sort tmp files

---
 python/refcat/tasks.py | 42 +++++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

(limited to 'python')

diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index ab682a0..4ae21fe 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -457,7 +457,7 @@ class URLTabs(Refcat):
         output = shellout("""
                           zstdcat -T0 {input} |
                           skate-map -m ru -skip-on-empty 3 |
-                          LC_ALL=C sort -T {tmpdir} -k3,3 -S25% --compress-program=zstd |
+                          LC_ALL=C sort -T {tmpdir} -k3,3 -S25% |
                           zstd -T0 -c > {output}
                           """,
                           n=self.n,
@@ -480,7 +480,7 @@ class URLTabsCleaned(Refcat):
         output = shellout("""
                           zstdcat -T0 {input} |
                           skate-cleanup -c url -allow http,https -X -B -S -f 3 |
-                          LC_ALL=C sort -T {tmpdir} -k3,3 -S25% --compress-program=zstd |
+                          LC_ALL=C sort -T {tmpdir} -k3,3 -S25% |
                           zstd -T0 -c > {output}
                           """,
                           n=self.n,
@@ -504,7 +504,7 @@ class URLList(Refcat):
                           zstdcat -T0 {input} |
                           cut -f 3 |
                           skate-cleanup -X -c url -B -S -f 1 |
-                          LC_ALL=C sort -u -T {tmpdir} -k1,1 -S25% --compress-program=zstd |
+                          LC_ALL=C sort -u -T {tmpdir} -k1,1 -S25% |
                           LC_ALL=C grep -E '^https?://' |
                           zstd -T0 -c > {output}
                           """,
@@ -535,7 +535,7 @@ class RefsDOI(Refcat):
                           zstdcat -T0 {input} |
                           skate-map -m ff -x biblio.doi -skip-on-empty 1 |
                           skate-cleanup -S -c doi -f 1 |
-                          LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --compress-program=zstd |
+                          LC_ALL=C sort -T {tmpdir} -k1,1 -S25% |
                           zstd -T0 -c > {output}
                           """,
                           n=self.n,
@@ -559,7 +559,7 @@ class RefsPMID(Refcat):
         output = shellout("""
                           zstdcat -T0 {input} |
                           skate-map -m ff -x biblio.pmid -skip-on-empty 1 |
-                          LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --compress-program=zstd |
+                          LC_ALL=C sort -T {tmpdir} -k1,1 -S25% |
                           zstd -T0 -c > {output}
                           """,
                           n=self.n,
@@ -583,7 +583,7 @@ class RefsPMCID(Refcat):
         output = shellout("""
                           zstdcat -T0 {input} |
                           skate-map -m ff -x biblio.pmcid -skip-on-empty 1 |
-                          LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --compress-program=zstd |
+                          LC_ALL=C sort -T {tmpdir} -k1,1 -S25% |
                           zstd -T0 -c > {output}
                           """,
                           n=self.n,
@@ -606,7 +606,7 @@ class RefsArxiv(Refcat):
         output = shellout("""
                           zstdcat -T0 {input} |
                           skate-map -m ff -x biblio.arxiv_id -skip-on-empty 1 |
-                          LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --compress-program=zstd |
+                          LC_ALL=C sort -T {tmpdir} -k1,1 -S25% |
                           zstd -T0 -c > {output}
                           """,
                           n=self.n,
@@ -636,7 +636,7 @@ class FatcatDOI(Refcat):
                           zstdcat -T0 {input} |
                           skate-map -m ff -x ext_ids.doi -skip-on-empty 1 |
                           skate-cleanup -S -c doi -f 1 |
-                          LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --compress-program=zstd |
+                          LC_ALL=C sort -T {tmpdir} -k1,1 -S25% |
                           zstd -T0 -c > {output}
                           """,
                           n=self.n,
@@ -659,7 +659,7 @@ class FatcatPMID(Refcat):
         output = shellout("""
                           zstdcat -T0 {input} |
                           skate-map -m ff -x ext_ids.pmid -skip-on-empty 1 |
-                          LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --compress-program=zstd |
+                          LC_ALL=C sort -T {tmpdir} -k1,1 -S25% |
                           zstd -T0 -c > {output}
                           """,
                           n=self.n,
@@ -682,7 +682,7 @@ class FatcatPMCID(Refcat):
         output = shellout("""
                           zstdcat -T0 {input} |
                           skate-map -m ff -x ext_ids.pmcid -skip-on-empty 1 |
-                          LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --compress-program=zstd |
+                          LC_ALL=C sort -T {tmpdir} -k1,1 -S25% |
                           zstd -T0 -c > {output}
                           """,
                           n=self.n,
@@ -705,7 +705,7 @@ class FatcatArxiv(Refcat):
         output = shellout("""
                           zstdcat -T0 {input} |
                           skate-map -m ff -x extra.arxiv.base_id -skip-on-empty 1 |
-                          LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --compress-program=zstd |
+                          LC_ALL=C sort -T {tmpdir} -k1,1 -S25% |
                           zstd -T0 -c > {output}
                           """,
                           n=self.n,
@@ -736,7 +736,7 @@ class FatcatMapped(Refcat):
         output = shellout("""
                           zstdcat -T0 {input} |
                           skate-map -m {mapper} -skip-on-empty 1 |
-                          LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --compress-program=zstd |
+                          LC_ALL=C sort -T {tmpdir} -k1,1 -S25% |
                           zstd -T0 -c > {output}
                           """,
                           mapper=self.mapper,
@@ -782,7 +782,7 @@ class RefsMapped(Refcat):
         output = shellout("""
                           zstdcat -T0 {input} |
                           skate-map -m {mapper} -skip-on-empty 1 |
-                          LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --compress-program=zstd |
+                          LC_ALL=C sort -T {tmpdir} -k1,1 -S25% |
                           zstd -T0 -c > {output}
                           """,
                           n=self.n,
@@ -943,7 +943,7 @@ class OpenLibraryEditionsByWork(Refcat):
                           zstdcat -T0 {input} |
                           cut -f 5 |
                           skate-map -skip-on-empty 1 -m ff -x 'works.0.key' |
-                          LC_ALL=C sort -T {tmpdir} -S25% -k1,1 --compress-program=zstd |
+                          LC_ALL=C sort -T {tmpdir} -S25% -k1,1 |
                           zstd -T0 -c > {output}
                           """,
                           tmpdir=self.tmpdir,
@@ -965,7 +965,7 @@ class OpenLibraryWorksSorted(Refcat):
         output = shellout("""
                           zstdcat -T0 {input} |
                           cut -f 2,5 |
-                          LC_ALL=C sort -T {tmpdir} -S25% -k1,1 --compress-program=zstd |
+                          LC_ALL=C sort -T {tmpdir} -S25% -k1,1 |
                           zstd -T0 -c > {output}
                           """,
                           tmpdir=self.tmpdir,
@@ -1047,7 +1047,7 @@ class OpenLibraryEditionsMapped(Refcat):
         output = shellout("""
                           zstdcat -T0 {input} |
                           skate-map -m {mapper} -skip-on-empty 1 |
-                          LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --compress-program=zstd |
+                          LC_ALL=C sort -T {tmpdir} -k1,1 -S25% |
                           zstd -T0 -c > {output}
                           """,
                           n=self.n,
@@ -1102,7 +1102,7 @@ class UnmatchedMapped(Refcat):
                           zstdcat -T0 {input} |
                           skate-conv -f ref |
                           skate-map -m rcns -skip-on-empty 1 |
-                          LC_ALL=C sort -T {tmpdir} -S25% -k1,1 --compress-program=zstd |
+                          LC_ALL=C sort -T {tmpdir} -S25% -k1,1 |
                           zstd -T0 -c > {output}
                           """,
                           tmpdir=self.tmpdir,
@@ -1185,7 +1185,7 @@ class OpenLibraryReleaseMapped(Refcat):
         output = shellout("""
                           zstdcat -T0 {input} |
                           skate-map -m {mapper} -skip-on-empty 1 |
-                          LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --compress-program=zstd |
+                          LC_ALL=C sort -T {tmpdir} -k1,1 -S25% |
                           zstd -T0 -c > {output}
                           """,
                           mapper=self.mapper,
@@ -1247,7 +1247,7 @@ class BrefSortedByWorkID(Refcat):
         output = shellout("""
                  zstdcat -T0 {bref} |
                  skate-map -B -m ff -x source_work_ident |
-                 LC_ALL=C sort -T {tmpdir} -S25% -k1,1 --compress-program=zstd | zstd -c -T0 > {output}
+                 LC_ALL=C sort -T {tmpdir} -S25% -k1,1 | zstd -c -T0 > {output}
                  """,
                           tmpdir=self.tmpdir,
                           bref=self.input().path)
@@ -1271,7 +1271,7 @@ class RefsByWorkID(Refcat):
         output = shellout("""
                           zstdcat -T0 {input} |
                           skate-map -m ff -x work_ident |
-                          LC_ALL=C sort -T {tmpdir} -S25% -k1,1 --compress-program=zstd |
+                          LC_ALL=C sort -T {tmpdir} -S25% -k1,1 |
                           zstd -c -T0 > {output}
                           """,
                           tmpdir=self.tmpdir,
@@ -1376,7 +1376,7 @@ class UnmatchedResolveJournalNamesMapped(Refcat):
         output = shellout("""
                           zstdcat -T0 {input} |
                           skate-map -m vcns -skip-on-empty 1 |
-                          LC_ALL=C sort -T {tmpdir} -k1,1 -S25% --compress-program=zstd |
+                          LC_ALL=C sort -T {tmpdir} -k1,1 -S25% |
                           zstd -T0 -c > {output}
                           """,
                           tmpdir=self.tmpdir,
-- 
cgit v1.2.3


From df8d801b0b7227d24e9508cfc2474859ee584d2a Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Wed, 7 Jul 2021 21:27:43 +0200
Subject: add WikipediaDOI

---
 python/refcat/tasks.py | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

(limited to 'python')

diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index 4ae21fe..0db6f86 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -300,6 +300,27 @@ class WikipediaCitationsMinimalDataset(luigi.ExternalTask, Refcat):
     Dataset contains parquet, but we want JSON here:
 
     $ parquet-tools cat --json minimal_dataset.parquet > minimal_dataset.json
+
+    Contains (07/2021) around 29276667 rows.
+
+    Rough id type distribution:
+
+        2160819 ISBN
+        1442176 DOI
+         825970 PMID
+         353425 ISSN
+         279369 PMC
+         185742 OCLC
+         181375 BIBCODE
+         110921 JSTOR
+          47601 ARXIV
+          15202 LCCN
+          12878 MR
+           8270 ASIN
+           6293 OL
+           3790 SSRN
+           3013 ZBL
+
     """
     def output(self):
         return luigi.LocalTarget(path=os.path.join(settings.WIKIPEDIA_CITATIONS, "minimal_dataset.json"))
@@ -1385,3 +1406,25 @@ class UnmatchedResolveJournalNamesMapped(Refcat):
 
     def output(self):
         return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
+
+# Wikipedia-related tasks
+
+class WikipediaDOI(Refcat):
+    """
+    Sorted DOI keys from Wikipedia: skate-wikipedia-doi output, byte-sorted on column 2 (presumably the DOI).
+    """
+    def requires(self):
+        return WikipediaCitationsMinimalDataset()  # external task; its output is minimal_dataset.json
+
+    def run(self):
+        output = shellout("""
+                          skate-wikipedia-doi < {input} |
+                          LC_ALL=C sort -T {tmpdir} -S 20% -k2,2 |
+                          zstd -T0 -c > {output}
+                          """,
+                          tmpdir=self.tmpdir,  # passed to sort -T, i.e. where sort writes its temporary files
+                          input=self.input().path)
+        luigi.LocalTarget(output).move(self.output().path)  # move the shellout result to the task's output path
+
+    def output(self):
+        return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)  # zstd-compressed TSV target
-- 
cgit v1.2.3