diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-06-02 21:00:41 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-06-02 21:00:41 +0200 |
commit | f3631ec35f772085959c9d58a998486d897ca95b (patch) | |
tree | d255336b10ed03ff30d21e716f211d09d197b5ef | |
parent | 36671f1f20211c2fa84f03dc2e1187d2cc8f8551 (diff) | |
download | refcat-f3631ec35f772085959c9d58a998486d897ca95b.tar.gz refcat-f3631ec35f772085959c9d58a998486d897ca95b.zip |
update docs
-rw-r--r-- | python/refcat/tasks.py | 18 |
1 files changed, 11 insertions, 7 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py index 48ec180..1f56e84 100644 --- a/python/refcat/tasks.py +++ b/python/refcat/tasks.py @@ -45,6 +45,12 @@ Overview * raw input "tasks" as luigi.ExternalTask * derivation +Note: We mostly use some shell pipelines with UNIX and custom tools (see: skate); we +may get rid of this "python layer" altogether, if we converged on what to +build. The most common pattern is map-reduce, e.g. derive a key from docs, +combine the results from e.g. two such key extractions and apply some +reduction, e.g. output schema generation. + # Various schema * release (fatcat database export) @@ -52,6 +58,7 @@ Overview * OL editions (open library editions) * OL authors (open library authors) * wiki (a particular wikipedia reference export) +* biblioref (or bref, the schema we store the citation graph in, ATM) Some operations, e.g. "fuzzy verification" require both compared documents to be release entities. This means, that we need to convert different formats into @@ -202,7 +209,6 @@ class OpenLibraryAuthors(luigi.ExternalTask, Refcat): # --------------------------------------- # - class RefsWithUnstructured(Refcat): """ Augment refs with data from biblio.unstructured - do this first, so we can use it @@ -249,6 +255,9 @@ class UnmatchedRefs(Refcat): """ File with not yet considered refs (e.g. no title, doi, ...); around 260,749,705. + + Note, that this data contains refs, which have more information, just + hidden in unstructured. """ def requires(self): return RefsWithUnstructured() @@ -276,7 +285,6 @@ class UnmatchedRefs(Refcat): # -------------------------------- # - class URLTabs(Refcat): """ Extract (work ident, release ident, url, doc). 519m45.710s (about 55k docs/s). @@ -302,11 +310,7 @@ class URLTabs(Refcat): class URLList(Refcat): """ - List of cleaned, unique URLs from refs. - - For CDX lookup, we just want ^http, so: - - $ zstdcat -T0 date-2021-05-06.tsv.zst | grep ^http > fatcat-refs-urllist-2021-05-06.tsv + List of mostly cleaned, unique URLs from refs. """ def requires(self): return URLTabs() |