diff options
-rw-r--r-- | python/refcat/cli.py | 2 | ||||
-rw-r--r-- | python/refcat/tasks.py | 75 |
2 files changed, 6 insertions, 71 deletions
diff --git a/python/refcat/cli.py b/python/refcat/cli.py index 6279247..c589310 100644 --- a/python/refcat/cli.py +++ b/python/refcat/cli.py @@ -177,12 +177,14 @@ def config(): with open(settings.settings_file) as f: print(f.read()) + def run(): """ For uniformity, have an extra subcommand for running a task. """ raise NotImplementedError() + def completion(): """ TODO: completion snippet can live elsewhere. diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py index 6b4a681..894e25a 100644 --- a/python/refcat/tasks.py +++ b/python/refcat/tasks.py @@ -196,6 +196,9 @@ try: except ValueError: date_from_tag = datetime.date.today() +# Raw inputs are luigi.ExternalTask instances. We can use settings.ini entries +# to configure paths for raw inputs. + class Refcat(BaseTask): """ @@ -222,7 +225,7 @@ class Refs(luigi.ExternalTask, Refcat): might increase in a next version. This comes from a custom derivation from an "heavy intermediate" format in a scholar pipeline. - As of 07/2021, we have 2507793772 raw refs. + As of 07/2021, we have 2,507,793,772 raw refs. """ def output(self): return luigi.LocalTarget(path=settings.REFS_FILE, format=Zstd) @@ -282,15 +285,6 @@ class WikipediaCitationsMinimalDataset(luigi.ExternalTask, Refcat): return luigi.LocalTarget(path=os.path.join(settings.WIKIPEDIA_CITATIONS, "minimal_dataset.json")) -class OpenLibraryDump(luigi.ExternalTask, Refcat): - """ - A solrdump exported version from a SOLR from: - https://archive.org/details/olsolr8-2021-04-12; about 30M items. - """ - def output(self): - return luigi.LocalTarget(path=settings.OL_DUMP, format=Zstd) - - class OpenLibraryEditions(luigi.ExternalTask, Refcat): """ Editions file (converted to zstd) https://openlibrary.org/developers/dumps. @@ -299,15 +293,6 @@ class OpenLibraryEditions(luigi.ExternalTask, Refcat): return luigi.LocalTarget(path=settings.OL_DUMP_EDITIONS, format=Zstd) -class OpenLibraryWorks(luigi.ExternalTask, Refcat): - """ - Works dump (converted to zstd), from - https://openlibrary.org/developers/dumps. - """ - def output(self): - return luigi.LocalTarget(path=settings.OL_DUMP_WORKS, format=Zstd) - - class OpenLibraryAuthors(luigi.ExternalTask, Refcat): """ Author dump (converted to zstd), from @@ -905,58 +890,6 @@ class BrefZipFuzzy(Refcat): # -# WIP: Open Library -# ----------------- -# - - -class OpenLibraryEditionsByWork(Refcat): - """ - DEPRECATED. Have editions keyed by work id, 9m5.037s. - """ - def requires(self): - return OpenLibraryEditions() - - def run(self): - output = shellout(""" - zstdcat -T0 {input} | - cut -f 5 | - skate-map -skip-on-empty 1 -m ff -x 'works.0.key' | - LC_ALL=C sort -T {tmpdir} -S25% -k1,1 | - zstd -T0 -c > {output} - """, - tmpdir=self.tmpdir, - input=self.input().path) - luigi.LocalTarget(output).move(self.output().path) - - def output(self): - return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd) - - -class OpenLibraryWorksSorted(Refcat): - """ - DEPRECATED. Sorted by work id. - """ - def requires(self): - return OpenLibraryWorks() - - def run(self): - output = shellout(""" - zstdcat -T0 {input} | - cut -f 2,5 | - LC_ALL=C sort -T {tmpdir} -S25% -k1,1 | - zstd -T0 -c > {output} - """, - tmpdir=self.tmpdir, - input=self.input().path) - - luigi.LocalTarget(output).move(self.output().path) - - def output(self): - return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd) - - -# # Open Library Fuzzy matching (OL editions -> release, key extraction) # -------------------------------------------------------------------- # |