From 73dc8ce8c0b1500ea5cb1fa177ca2ece961726fc Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Thu, 3 Jun 2021 01:47:46 +0200
Subject: add RefsWithoutIdentifiers

---
 python/refcat/tasks.py | 121 ++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 90 insertions(+), 31 deletions(-)

diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index 1f56e84..bc2fba1 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -51,7 +51,8 @@ build. The most common pattern is map-reduce, e.g. derive a key from docs,
 combine the results from e.g. two such key extractions and apply some
 reduction, e.g. output schema generation.
 
-# Various schema
+Various schemas
+---------------
 
 * release (fatcat database export)
 * ref (one document per reference)
@@ -64,6 +65,34 @@ Some operations, e.g. "fuzzy verification" require both compared documents to
 be release entities. This means, that we need to convert different formats into
 the release format.
 
+Mappers
+-------
+
+For catalog (fatcat) and refs, we extract ids:
+
+* doi
+* pmid
+* pmcid
+* arxiv
+
+We run fuzzy title matching and verification. Here, we need to convert refs to
+releases to be able to run verify (could implement a verification for various
+schemas, too -- but release seems complete enough).
+
+For OL (Open Library) we need to fuse authors into the editions dataset first.
+
+Reducers
+--------
+
+Exact mode for ids:
+
+* doi
+* pmid
+* pmcid
+* arxiv
+
+For fuzzy matching, we use "fuzzy" mode (and keep only exact and strong matches).
+
 Config
 ------
 
@@ -209,6 +238,7 @@ class OpenLibraryAuthors(luigi.ExternalTask, Refcat):
 # ---------------------------------------
 #
 
+
 class RefsWithUnstructured(Refcat):
     """
     Augment refs with data from biblio.unstructured - do this first, so we can use it
@@ -257,7 +287,7 @@ class UnmatchedRefs(Refcat):
     260,749,705.
 
     Note, that this data contains refs, which have more information, just
-    hidden in unstructured.
+    hidden in the "unstructured" field. XXX: We'll come back to this later.
     """
     def requires(self):
         return RefsWithUnstructured()
@@ -280,11 +310,33 @@ class UnmatchedRefs(Refcat):
         return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
 
 
+class RefsWithoutIdentifiers(Refcat):
+    def requires(self):
+        return RefsWithUnstructured()
+
+    def run(self):
+        output = shellout("""
+                          zstdcat -T0 {input} |
+                          parallel -j {n} --block 10M --pipe
+                              "jq -rc 'select(.biblio.doi == null and
+                                              .biblio.pmid == null and
+                                              .biblio.pmcid == null and
+                                              .biblio.arxiv_id == null)'" |
+                          zstd -T0 -c > {output}""",
+                          n=self.n,
+                          input=self.input().path)
+        luigi.LocalTarget(output).move(self.output().path)
+
+    def output(self):
+        return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
+
+
 #
 # Generate URL list for CDX lookup
 # --------------------------------
 #
 
+
 class URLTabs(Refcat):
     """
     Extract (work ident, release ident, url, doc). 519m45.710s (about 55k docs/s).
@@ -877,15 +929,15 @@ class OpenLibraryEditionsMapped(Refcat):
 
 
 #
-# Extra
-# -----
+# Open Library Fuzzy
+# ------------------
 #
 
 
 class UnmatchedMapped(Refcat):
     """
-    Map unmatched refs (converted to release schema on the fly) to titles to do
-    approximate title matches with OL; 35m14.801s.
+    Map unmatched refs (converted to release schema on the fly) to container
+    names to do approximate title matches with OL; 35m14.801s.
     """
     def requires(self):
         return UnmatchedRefs()
@@ -906,6 +958,38 @@ class UnmatchedMapped(Refcat):
         return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
 
 
+class UnmatchedOpenLibraryMatchTable(Refcat):
+    """
+    Run matching of unmatched refs against Open Library editions and write
+    tabular results to file. About 50M rows.
+    """
+    def requires(self):
+        return {
+            "unmatched": UnmatchedMapped(),  # We could include a bit more here, namely records with titles.
+            "ol": OpenLibraryEditionsMapped(),
+        }
+
+    def run(self):
+        output = shellout("""
+                          skate-reduce -m oled
+                              -O <(zstdcat -T0 {ol})
+                              -F <(zstdcat -T0 {unmatched}) |
+                          zstd -c > {output}
+                          """,
+                          ol=self.input().get("ol").path,
+                          unmatched=self.input().get("unmatched").path)
+        luigi.LocalTarget(output).move(self.output().path)
+
+    def output(self):
+        return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
+
+
+#
+# Extra
+# -----
+#
+
+
 class UnmatchedRefsToRelease(Refcat):
     """
     Convert unmatched refs to releases.
@@ -969,28 +1053,3 @@ class UnmatchedResolveJournalNamesMapped(Refcat):
 
     def output(self):
         return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
-
-
-class UnmatchedOpenLibraryMatchTable(Refcat):
-    """
-    Run matching and write tabular results to file. About 50M rows.
-    """
-    def requires(self):
-        return {
-            "unmatched": UnmatchedMapped(),
-            "ol": OpenLibraryEditionsMapped(),
-        }
-
-    def run(self):
-        output = shellout("""
-                          skate-reduce -m oled
-                              -O <(zstdcat -T0 {ol})
-                              -F <(zstdcat -T0 {unmatched}) |
-                          zstd -c > {output}
-                          """,
-                          ol=self.input().get("ol").path,
-                          unmatched=self.input().get("unmatched").path)
-        luigi.LocalTarget(output).move(self.output().path)
-
-    def output(self):
-        return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
-- 
cgit v1.2.3