tasks: update docs

author: Martin Czygan <martin.czygan@gmail.com> 2021-07-23 11:11:44 +0200
committer: Martin Czygan <martin.czygan@gmail.com> 2021-07-23 11:11:44 +0200
commit: af405a9ae8bc4a5530dce6e17e6fc41bab4c7403 (patch)
tree: 18e9ae04971afc9a1b571a3f5ce7c2a703b7e64c /python
parent: 172b10ccb9f0e3bdf6985fc56e71db956cc28a6a (diff)
download: refcat-af405a9ae8bc4a5530dce6e17e6fc41bab4c7403.tar.gz
refcat-af405a9ae8bc4a5530dce6e17e6fc41bab4c7403.zip
1 files changed, 15 insertions, 64 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index c798272..42fa924 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+
 """
 Set of luigi tasks to derive a citation graph.
 
@@ -57,58 +58,6 @@ Set of luigi tasks to derive a citation graph.
 
 ------------------------------------------------------------------------
 
-Deps (2021-06-07) for final "bref" file:
-
- \_ Bref(date=2021-05-06)
-    \_ BrefZipDOI(date=2021-05-06)
-       \_ FatcatDOI(date=2021-05-06)
-          \_ ReleaseExportReduced(date=2021-05-06)
-             \_ ReleaseExportExpanded(date=2021-05-06)
-       \_ RefsDOI(date=2021-05-06)
-          \_ RefsWithUnstructured(date=2021-05-06)
-             \_ Refs(date=2021-05-06)
-    \_ BrefZipOpenLibrary(date=2021-05-06)
-       \_ OpenLibraryEditionsMapped(date=2021-05-06, mapper=ts)
-          \_ OpenLibraryEditionsToRelease(date=2021-05-06)
-             \_ OpenLibraryEditions(date=2021-05-06)
-             \_ OpenLibraryAuthorMapping(date=2021-05-06)
-                \_ OpenLibraryAuthors(date=2021-05-06)
-       \_ UnmatchedMapped(date=2021-05-06)
-          \_ RefsWithoutIdentifiers(date=2021-05-06)
-             \_ RefsWithUnstructured(date=2021-05-06)
-                \_ Refs(date=2021-05-06)
-    \_ BrefZipPMID(date=2021-05-06)
-       \_ RefsPMID(date=2021-05-06)
-          \_ RefsWithUnstructured(date=2021-05-06)
-             \_ Refs(date=2021-05-06)
-       \_ FatcatPMID(date=2021-05-06)
-          \_ ReleaseExportReduced(date=2021-05-06)
-             \_ ReleaseExportExpanded(date=2021-05-06)
-    \_ BrefZipPMCID(date=2021-05-06)
-       \_ RefsPMCID(date=2021-05-06)
-          \_ RefsWithUnstructured(date=2021-05-06)
-             \_ Refs(date=2021-05-06)
-       \_ FatcatPMCID(date=2021-05-06)
-          \_ ReleaseExportReduced(date=2021-05-06)
-             \_ ReleaseExportExpanded(date=2021-05-06)
-    \_ BrefZipArxiv(date=2021-05-06)
-       \_ RefsArxiv(date=2021-05-06)
-          \_ RefsWithUnstructured(date=2021-05-06)
-             \_ Refs(date=2021-05-06)
-       \_ FatcatArxiv(date=2021-05-06)
-          \_ ReleaseExportReduced(date=2021-05-06)
-             \_ ReleaseExportExpanded(date=2021-05-06)
-    \_ BrefZipFuzzy(date=2021-05-06, mapper=ts)
-       \_ FatcatMapped(date=2021-05-06, mapper=ts)
-          \_ ReleaseExportReduced(date=2021-05-06)
-             \_ ReleaseExportExpanded(date=2021-05-06)
-       \_ RefsMapped(date=2021-05-06, mapper=ts)
-          \_ RefsToRelease(date=2021-05-06)
-             \_ RefsWithUnstructured(date=2021-05-06)
-                \_ Refs(date=2021-05-06)
-
-------------------------------------------------------------------------
-
 Overview
 --------
 
@@ -257,9 +206,6 @@ class Refcat(BaseTask):
         return logging.getLogger('refcat')
 
 
-# ----8< Raw inputs; XXX: add wikipedia dump, mag, OCI, ...
-
-
 class Refs(luigi.ExternalTask, Refcat):
     """
     Compressed (zstd) references, as of 01/2021 containing ~1.8B docs; this
@@ -369,8 +315,8 @@ class OpenLibraryAuthors(luigi.ExternalTask, Refcat):
 
 class RefsWithUnstructured(Refcat):
     """
-    Augment refs with data from biblio.unstructured - do this first, so we can use it
-    all subsequent steps.
+    Augment refs with data from biblio.unstructured - do this first, so we can
+    use it in all subsequent steps.
     """
     def requires(self):
         return Refs()
@@ -390,7 +336,7 @@ class RefsWithUnstructured(Refcat):
 
 class ReleaseExportReduced(Refcat):
     """
-    Reduce dataset size, stripping some heavy fields. 110min.
+    Reduce fatcat exported dataset size, stripping some heavy fields (110min).
     """
     def requires(self):
         return ReleaseExportExpanded()
@@ -412,10 +358,11 @@ class ReleaseExportReduced(Refcat):
 class UnmatchedRefs(Refcat):
     """
     File with not yet considered refs (e.g. no title, doi, ...); around
-    260,749,705.
+    260,749,705. Note that this is a lower bound, since docs with titles may
+    remain unmatched as well.
 
     Note, that this data contains refs, which have more information, just
-    hidden in "unstructured" field. XXX: We'll come back to this later.
+    hidden in the "unstructured" field. TODO: Parse all unparsed field data.
     """
     def requires(self):
         return RefsWithUnstructured()
@@ -439,6 +386,9 @@ class UnmatchedRefs(Refcat):
 
 
 class RefsWithoutIdentifiers(Refcat):
+    """
+    All references that do not have an identifier.
+    """
     def requires(self):
         return RefsWithUnstructured()
 
@@ -467,7 +417,8 @@ class RefsWithoutIdentifiers(Refcat):
 
 class URLTabs(Refcat):
     """
-    Extract (work ident, release ident, url, doc). 519m45.710s (about 55k docs/s).
+    Extract (work ident, release ident, url, doc) from refs (519m45.710s, about
+    55k docs/s); sorted by url.
     """
     def requires(self):
         return RefsWithUnstructured()
@@ -490,7 +441,7 @@ class URLTabs(Refcat):
 
 class URLTabsCleaned(Refcat):
     """
-    URLTabs, cleaned, unsorted.
+    URLTabs, cleaned, sorted by url. Notes: https://is.gd/C7upZq
     """
     def requires(self):
         return URLTabs()
@@ -537,8 +488,8 @@ class URLList(Refcat):
 
 
 #
-# Generate (key, doc) from refs
-# -----------------------------
+# Mapping tasks
+# -------------
 #
author	Martin Czygan <martin.czygan@gmail.com>	2021-07-23 11:11:44 +0200
committer	Martin Czygan <martin.czygan@gmail.com>	2021-07-23 11:11:44 +0200
commit	af405a9ae8bc4a5530dce6e17e6fc41bab4c7403 (patch)
tree	18e9ae04971afc9a1b571a3f5ce7c2a703b7e64c /python
parent	172b10ccb9f0e3bdf6985fc56e71db956cc28a6a (diff)
download	refcat-af405a9ae8bc4a5530dce6e17e6fc41bab4c7403.tar.gz refcat-af405a9ae8bc4a5530dce6e17e6fc41bab4c7403.zip