about summary refs log tree commit diff stats
path: root/python
diff options
context:
space:
mode:
Diffstat (limited to 'python')
-rw-r--r-- python/refcat/tasks.py 79
1 files changed, 15 insertions, 64 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index c798272..42fa924 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -1,4 +1,5 @@
#!/usr/bin/env python3
+
"""
Set of luigi tasks to derive a citation graph.
@@ -57,58 +58,6 @@ Set of luigi tasks to derive a citation graph.
------------------------------------------------------------------------
-Deps (2021-06-07) for final "bref" file:
-
- \_ Bref(date=2021-05-06)
- \_ BrefZipDOI(date=2021-05-06)
- \_ FatcatDOI(date=2021-05-06)
- \_ ReleaseExportReduced(date=2021-05-06)
- \_ ReleaseExportExpanded(date=2021-05-06)
- \_ RefsDOI(date=2021-05-06)
- \_ RefsWithUnstructured(date=2021-05-06)
- \_ Refs(date=2021-05-06)
- \_ BrefZipOpenLibrary(date=2021-05-06)
- \_ OpenLibraryEditionsMapped(date=2021-05-06, mapper=ts)
- \_ OpenLibraryEditionsToRelease(date=2021-05-06)
- \_ OpenLibraryEditions(date=2021-05-06)
- \_ OpenLibraryAuthorMapping(date=2021-05-06)
- \_ OpenLibraryAuthors(date=2021-05-06)
- \_ UnmatchedMapped(date=2021-05-06)
- \_ RefsWithoutIdentifiers(date=2021-05-06)
- \_ RefsWithUnstructured(date=2021-05-06)
- \_ Refs(date=2021-05-06)
- \_ BrefZipPMID(date=2021-05-06)
- \_ RefsPMID(date=2021-05-06)
- \_ RefsWithUnstructured(date=2021-05-06)
- \_ Refs(date=2021-05-06)
- \_ FatcatPMID(date=2021-05-06)
- \_ ReleaseExportReduced(date=2021-05-06)
- \_ ReleaseExportExpanded(date=2021-05-06)
- \_ BrefZipPMCID(date=2021-05-06)
- \_ RefsPMCID(date=2021-05-06)
- \_ RefsWithUnstructured(date=2021-05-06)
- \_ Refs(date=2021-05-06)
- \_ FatcatPMCID(date=2021-05-06)
- \_ ReleaseExportReduced(date=2021-05-06)
- \_ ReleaseExportExpanded(date=2021-05-06)
- \_ BrefZipArxiv(date=2021-05-06)
- \_ RefsArxiv(date=2021-05-06)
- \_ RefsWithUnstructured(date=2021-05-06)
- \_ Refs(date=2021-05-06)
- \_ FatcatArxiv(date=2021-05-06)
- \_ ReleaseExportReduced(date=2021-05-06)
- \_ ReleaseExportExpanded(date=2021-05-06)
- \_ BrefZipFuzzy(date=2021-05-06, mapper=ts)
- \_ FatcatMapped(date=2021-05-06, mapper=ts)
- \_ ReleaseExportReduced(date=2021-05-06)
- \_ ReleaseExportExpanded(date=2021-05-06)
- \_ RefsMapped(date=2021-05-06, mapper=ts)
- \_ RefsToRelease(date=2021-05-06)
- \_ RefsWithUnstructured(date=2021-05-06)
- \_ Refs(date=2021-05-06)
-
-------------------------------------------------------------------------
-
Overview
--------
@@ -257,9 +206,6 @@ class Refcat(BaseTask):
return logging.getLogger('refcat')
-# ----8< Raw inputs; XXX: add wikipedia dump, mag, OCI, ...
-
-
class Refs(luigi.ExternalTask, Refcat):
"""
Compressed (zstd) references, as of 01/2021 containing ~1.8B docs; this
@@ -369,8 +315,8 @@ class OpenLibraryAuthors(luigi.ExternalTask, Refcat):
class RefsWithUnstructured(Refcat):
"""
- Augment refs with data from biblio.unstructured - do this first, so we can use it
- all subsequent steps.
+ Augment refs with data from biblio.unstructured - do this first, so we can
+ use it in all subsequent steps.
"""
def requires(self):
return Refs()
@@ -390,7 +336,7 @@ class RefsWithUnstructured(Refcat):
class ReleaseExportReduced(Refcat):
"""
- Reduce dataset size, stripping some heavy fields. 110min.
+ Reduce fatcat exported dataset size, stripping some heavy fields (110min).
"""
def requires(self):
return ReleaseExportExpanded()
@@ -412,10 +358,11 @@ class ReleaseExportReduced(Refcat):
class UnmatchedRefs(Refcat):
"""
File with not yet considered refs (e.g. no title, doi, ...); around
- 260,749,705.
+ 260,749,705. Note that this is a lower bound, since docs with titles may
+ also remain unmatched.
Note, that this data contains refs, which have more information, just
- hidden in "unstructured" field. XXX: We'll come back to this later.
+ hidden in "unstructured" field. TODO: Parse all unparsed field data.
"""
def requires(self):
return RefsWithUnstructured()
@@ -439,6 +386,9 @@ class UnmatchedRefs(Refcat):
class RefsWithoutIdentifiers(Refcat):
+ """
+ All references that do not have an identifier.
+ """
def requires(self):
return RefsWithUnstructured()
@@ -467,7 +417,8 @@ class RefsWithoutIdentifiers(Refcat):
class URLTabs(Refcat):
"""
- Extract (work ident, release ident, url, doc). 519m45.710s (about 55k docs/s).
+ Extract (work ident, release ident, url, doc) from refs (519m45.710s, about
+ 55k docs/s); sorted by url.
"""
def requires(self):
return RefsWithUnstructured()
@@ -490,7 +441,7 @@ class URLTabs(Refcat):
class URLTabsCleaned(Refcat):
"""
- URLTabs, cleaned, unsorted.
+ URLTabs, cleaned, sorted by url. Notes: https://is.gd/C7upZq
"""
def requires(self):
return URLTabs()
@@ -537,8 +488,8 @@ class URLList(Refcat):
#
-# Generate (key, doc) from refs
-# -----------------------------
+# Mapping tasks
+# -------------
#