diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-05-06 20:13:20 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-05-06 20:13:20 +0200 |
commit | 6c327acbf5799dde9c153843ac3ba1471e88317c (patch) | |
tree | faf5144f4ed9ce1ec8115783f17cd4c8e7816fb7 /python | |
parent | b46c9d351aa0a2a5c3618a1420259d4605a9654e (diff) | |
download | refcat-6c327acbf5799dde9c153843ac3ba1471e88317c.tar.gz refcat-6c327acbf5799dde9c153843ac3ba1471e88317c.zip |
start to cleanup tasks
Diffstat (limited to 'python')
-rw-r--r-- | python/refcat/tasks.py | 47 |
1 files changed, 17 insertions, 30 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py index 182a51f..4c6723c 100644 --- a/python/refcat/tasks.py +++ b/python/refcat/tasks.py @@ -139,7 +139,7 @@ class Refcat(BaseTask): BASE = settings.BASE TAG = 'refcat' - date = luigi.DateParameter(default=datetime.date(2021, 2, 20), description="a versioning help, change this manually") + date = luigi.DateParameter(default=datetime.date(2021, 5, 6), description="a versioning help, change this manually") tmpdir = luigi.Parameter(default="/magna/tmp", description="set tempdir", significant=False) n = luigi.IntParameter(default=multiprocessing.cpu_count(), significant=False) @@ -156,7 +156,8 @@ class Refcat(BaseTask): class Refs(luigi.ExternalTask, Refcat): """ - Compressed (zstd) references, as of 01/2021 containing ~1.8B docs. + Compressed (zstd) references, as of 01/2021 containing ~1.8B docs; this + might increase in a next version. """ def output(self): return luigi.LocalTarget(path=settings.REFS_FILE, format=Zstd) @@ -164,7 +165,7 @@ class Refs(luigi.ExternalTask, Refcat): class ReleaseExportExpanded(luigi.ExternalTask, Refcat): """ - Release export, zstd version. + Fatcat release export, zstd version, from e.g. https://archive.org/details/fatcat_snapshots_and_exports """ def output(self): return luigi.LocalTarget(path=settings.RELEASE_EXPORT_EXPANDED_FILE, format=Zstd) @@ -180,9 +181,9 @@ class MAGPapers(luigi.ExternalTask, Refcat): class WikipediaCitationsMinimalDataset(luigi.ExternalTask, Refcat): """ - From archive.org/details/wikipedia_citations_2020-07-14 (Wikipedia + From https://archive.org/details/wikipedia_citations_2020-07-14 (Wikipedia Citations: A comprehensive dataset of citations with identifiers extracted - from English Wikipedia). + from English Wikipedia); http://doi.org/10.5281/zenodo.3940692. 
Dataset contains parquet, but we want JSON here: @@ -191,17 +192,22 @@ class WikipediaCitationsMinimalDataset(luigi.ExternalTask, Refcat): def output(self): return luigi.LocalTarget(path=os.path.join(settings.WIKIPEDIA_CITATIONS, "minimal_dataset.json")) -class OpenLibraryDump(luigi.ExternalTask, Refcat): +class OpenLibraryDump(luigi.ExternalTask, Refcat): + """ + A solrdump exported version from a SOLR from: + https://archive.org/details/olsolr8-2021-04-12; about 30M items. + """ def output(self): return luigi.LocalTarget(path=settings.OL_DUMP, format=Zstd) + # ----8< Derivations class RefsWithUnstructured(Refcat): """ - Augment refs with data from unstructured. Do this first, so we can use it + Augment refs with data from biblio.unstructured. Do this first, so we can use it all subsequent steps. """ def requires(self): @@ -211,7 +217,7 @@ class RefsWithUnstructured(Refcat): output = shellout(""" zstdcat -T0 {input} | skate-from-unstructured | - zstd -T0 -c9 > {output} + zstd -T0 -c > {output} """, input=self.input().path) luigi.LocalTarget(output).move(self.output().path) @@ -222,7 +228,7 @@ class RefsWithUnstructured(Refcat): class ReleaseExportReduced(Refcat): """ - Reduce dataset size, stripping fields. + Reduce dataset size, stripping some heavy fields. """ def requires(self): return ReleaseExportExpanded() @@ -240,25 +246,6 @@ class ReleaseExportReduced(Refcat): return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd) -class ReleaseExportTitleOnly(Refcat): - """ - Reduce dataset size, only keep title. 
- """ - def requires(self): - return ReleaseExportReduced() - - def run(self): - output = shellout(""" - zstdcat -T0 {input} | - parallel --block 10M -j 16 --pipe "jq -rc '{{\"title\": .title}}'" | - zstd -T0 -c9 > {output} - """, - input=self.input().path) - luigi.LocalTarget(output).move(self.output().path) - - def output(self): - return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd) - class URLTabs(Refcat): """ @@ -1431,8 +1418,10 @@ class RefsSortedIdent(Refcat): def output(self): return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd) + # OL + class WithISBN(Refcat): """ Keeps converted refs with isbn. @@ -1475,5 +1464,3 @@ class OpenLibraryWorks(Refcat): def output(self): return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd) - - |