diff options
Diffstat (limited to 'python')
-rw-r--r-- | python/refcat/tasks.py | 79 |
1 file changed, 15 insertions, 64 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py index c798272..42fa924 100644 --- a/python/refcat/tasks.py +++ b/python/refcat/tasks.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 + """ Set of luigi tasks to derive a citation graph. @@ -57,58 +58,6 @@ Set of luigi tasks to derive a citation graph. ------------------------------------------------------------------------ -Deps (2021-06-07) for final "bref" file: - - \_ Bref(date=2021-05-06) - \_ BrefZipDOI(date=2021-05-06) - \_ FatcatDOI(date=2021-05-06) - \_ ReleaseExportReduced(date=2021-05-06) - \_ ReleaseExportExpanded(date=2021-05-06) - \_ RefsDOI(date=2021-05-06) - \_ RefsWithUnstructured(date=2021-05-06) - \_ Refs(date=2021-05-06) - \_ BrefZipOpenLibrary(date=2021-05-06) - \_ OpenLibraryEditionsMapped(date=2021-05-06, mapper=ts) - \_ OpenLibraryEditionsToRelease(date=2021-05-06) - \_ OpenLibraryEditions(date=2021-05-06) - \_ OpenLibraryAuthorMapping(date=2021-05-06) - \_ OpenLibraryAuthors(date=2021-05-06) - \_ UnmatchedMapped(date=2021-05-06) - \_ RefsWithoutIdentifiers(date=2021-05-06) - \_ RefsWithUnstructured(date=2021-05-06) - \_ Refs(date=2021-05-06) - \_ BrefZipPMID(date=2021-05-06) - \_ RefsPMID(date=2021-05-06) - \_ RefsWithUnstructured(date=2021-05-06) - \_ Refs(date=2021-05-06) - \_ FatcatPMID(date=2021-05-06) - \_ ReleaseExportReduced(date=2021-05-06) - \_ ReleaseExportExpanded(date=2021-05-06) - \_ BrefZipPMCID(date=2021-05-06) - \_ RefsPMCID(date=2021-05-06) - \_ RefsWithUnstructured(date=2021-05-06) - \_ Refs(date=2021-05-06) - \_ FatcatPMCID(date=2021-05-06) - \_ ReleaseExportReduced(date=2021-05-06) - \_ ReleaseExportExpanded(date=2021-05-06) - \_ BrefZipArxiv(date=2021-05-06) - \_ RefsArxiv(date=2021-05-06) - \_ RefsWithUnstructured(date=2021-05-06) - \_ Refs(date=2021-05-06) - \_ FatcatArxiv(date=2021-05-06) - \_ ReleaseExportReduced(date=2021-05-06) - \_ ReleaseExportExpanded(date=2021-05-06) - \_ BrefZipFuzzy(date=2021-05-06, mapper=ts) - \_ FatcatMapped(date=2021-05-06, mapper=ts) 
- \_ ReleaseExportReduced(date=2021-05-06) - \_ ReleaseExportExpanded(date=2021-05-06) - \_ RefsMapped(date=2021-05-06, mapper=ts) - \_ RefsToRelease(date=2021-05-06) - \_ RefsWithUnstructured(date=2021-05-06) - \_ Refs(date=2021-05-06) - ------------------------------------------------------------------------- - Overview -------- @@ -257,9 +206,6 @@ class Refcat(BaseTask): return logging.getLogger('refcat') -# ----8< Raw inputs; XXX: add wikipedia dump, mag, OCI, ... - - class Refs(luigi.ExternalTask, Refcat): """ Compressed (zstd) references, as of 01/2021 containing ~1.8B docs; this @@ -369,8 +315,8 @@ class OpenLibraryAuthors(luigi.ExternalTask, Refcat): class RefsWithUnstructured(Refcat): """ - Augment refs with data from biblio.unstructured - do this first, so we can use it - all subsequent steps. + Augment refs with data from biblio.unstructured - do this first, so we can + use it in all subsequent steps. """ def requires(self): return Refs() @@ -390,7 +336,7 @@ class RefsWithUnstructured(Refcat): class ReleaseExportReduced(Refcat): """ - Reduce dataset size, stripping some heavy fields. 110min. + Reduce fatcat exported dataset size, stripping some heavy fields (110min). """ def requires(self): return ReleaseExportExpanded() @@ -412,10 +358,11 @@ class ReleaseExportReduced(Refcat): class UnmatchedRefs(Refcat): """ File with not yet considered refs (e.g. no title, doi, ...); around - 260,749,705. + 260,749,705. Note that this is a lower bound, since docs with titles may + not be matched as well. Note, that this data contains refs, which have more information, just - hidden in "unstructured" field. XXX: We'll come back to this later. + hidden in "unstructured" field. TODO: Parse all unparsed field data. """ def requires(self): return RefsWithUnstructured() @@ -439,6 +386,9 @@ class UnmatchedRefs(Refcat): class RefsWithoutIdentifiers(Refcat): + """ + All references, which do not have an identifier. 
+ """ def requires(self): return RefsWithUnstructured() @@ -467,7 +417,8 @@ class RefsWithoutIdentifiers(Refcat): class URLTabs(Refcat): """ - Extract (work ident, release ident, url, doc). 519m45.710s (about 55k docs/s). + Extract (work ident, release ident, url, doc) from refs (519m45.710s, about + 55k docs/s); sorted by url. """ def requires(self): return RefsWithUnstructured() @@ -490,7 +441,7 @@ class URLTabs(Refcat): class URLTabsCleaned(Refcat): """ - URLTabs, cleaned, unsorted. + URLTabs, cleaned, sorted by url. Notes: https://is.gd/C7upZq """ def requires(self): return URLTabs() @@ -537,8 +488,8 @@ class URLList(Refcat): # -# Generate (key, doc) from refs -# ----------------------------- +# Mapping tasks +# ------------- # |