about summary refs log tree commit diff stats
path: root/python
diff options
context:
space:
mode:
Diffstat (limited to 'python')
-rw-r--r-- python/refcat/cli.py 2
-rw-r--r-- python/refcat/tasks.py 75
2 files changed, 6 insertions, 71 deletions
diff --git a/python/refcat/cli.py b/python/refcat/cli.py
index 6279247..c589310 100644
--- a/python/refcat/cli.py
+++ b/python/refcat/cli.py
@@ -177,12 +177,14 @@ def config():
with open(settings.settings_file) as f:
print(f.read())
+
def run():
"""
For uniformity, have an extra subcommand for running a task.
"""
raise NotImplementedError()
+
def completion():
"""
TODO: completion snippet can live elsewhere.
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index 6b4a681..894e25a 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -196,6 +196,9 @@ try:
except ValueError:
date_from_tag = datetime.date.today()
+# Raw inputs are luigi.ExternalTask instances. We can use settings.ini entries
+# to configure paths for raw inputs.
+
class Refcat(BaseTask):
"""
@@ -222,7 +225,7 @@ class Refs(luigi.ExternalTask, Refcat):
might increase in a next version. This comes from a custom derivation from
an "heavy intermediate" format in a scholar pipeline.
- As of 07/2021, we have 2507793772 raw refs.
+ As of 07/2021, we have 2,507,793,772 raw refs.
"""
def output(self):
return luigi.LocalTarget(path=settings.REFS_FILE, format=Zstd)
@@ -282,15 +285,6 @@ class WikipediaCitationsMinimalDataset(luigi.ExternalTask, Refcat):
return luigi.LocalTarget(path=os.path.join(settings.WIKIPEDIA_CITATIONS, "minimal_dataset.json"))
-class OpenLibraryDump(luigi.ExternalTask, Refcat):
- """
- A solrdump exported version from a SOLR from:
- https://archive.org/details/olsolr8-2021-04-12; about 30M items.
- """
- def output(self):
- return luigi.LocalTarget(path=settings.OL_DUMP, format=Zstd)
-
-
class OpenLibraryEditions(luigi.ExternalTask, Refcat):
"""
Editions file (converted to zstd) https://openlibrary.org/developers/dumps.
@@ -299,15 +293,6 @@ class OpenLibraryEditions(luigi.ExternalTask, Refcat):
return luigi.LocalTarget(path=settings.OL_DUMP_EDITIONS, format=Zstd)
-class OpenLibraryWorks(luigi.ExternalTask, Refcat):
- """
- Works dump (converted to zstd), from
- https://openlibrary.org/developers/dumps.
- """
- def output(self):
- return luigi.LocalTarget(path=settings.OL_DUMP_WORKS, format=Zstd)
-
-
class OpenLibraryAuthors(luigi.ExternalTask, Refcat):
"""
Author dump (converted to zstd), from
@@ -905,58 +890,6 @@ class BrefZipFuzzy(Refcat):
#
-# WIP: Open Library
-# -----------------
-#
-
-
-class OpenLibraryEditionsByWork(Refcat):
- """
- DEPRECATED. Have editions keyed by work id, 9m5.037s.
- """
- def requires(self):
- return OpenLibraryEditions()
-
- def run(self):
- output = shellout("""
- zstdcat -T0 {input} |
- cut -f 5 |
- skate-map -skip-on-empty 1 -m ff -x 'works.0.key' |
- LC_ALL=C sort -T {tmpdir} -S25% -k1,1 |
- zstd -T0 -c > {output}
- """,
- tmpdir=self.tmpdir,
- input=self.input().path)
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
-
-
-class OpenLibraryWorksSorted(Refcat):
- """
- DEPRECATED. Sorted by work id.
- """
- def requires(self):
- return OpenLibraryWorks()
-
- def run(self):
- output = shellout("""
- zstdcat -T0 {input} |
- cut -f 2,5 |
- LC_ALL=C sort -T {tmpdir} -S25% -k1,1 |
- zstd -T0 -c > {output}
- """,
- tmpdir=self.tmpdir,
- input=self.input().path)
-
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
-
-
-#
# Open Library Fuzzy matching (OL editions -> release, key extraction)
# --------------------------------------------------------------------
#