diff options
| author | Martin Czygan <martin.czygan@gmail.com> | 2021-05-06 20:13:20 +0200 | 
|---|---|---|
| committer | Martin Czygan <martin.czygan@gmail.com> | 2021-05-06 20:13:20 +0200 | 
| commit | 6c327acbf5799dde9c153843ac3ba1471e88317c (patch) | |
| tree | faf5144f4ed9ce1ec8115783f17cd4c8e7816fb7 /python | |
| parent | b46c9d351aa0a2a5c3618a1420259d4605a9654e (diff) | |
| download | refcat-6c327acbf5799dde9c153843ac3ba1471e88317c.tar.gz refcat-6c327acbf5799dde9c153843ac3ba1471e88317c.zip | |
start to clean up tasks
Diffstat (limited to 'python')
| -rw-r--r-- | python/refcat/tasks.py | 47 | 
1 file changed, 17 insertions, 30 deletions
| diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py index 182a51f..4c6723c 100644 --- a/python/refcat/tasks.py +++ b/python/refcat/tasks.py @@ -139,7 +139,7 @@ class Refcat(BaseTask):      BASE = settings.BASE      TAG = 'refcat' -    date = luigi.DateParameter(default=datetime.date(2021, 2, 20), description="a versioning help, change this manually") +    date = luigi.DateParameter(default=datetime.date(2021, 5, 6), description="a versioning help, change this manually")      tmpdir = luigi.Parameter(default="/magna/tmp", description="set tempdir", significant=False)      n = luigi.IntParameter(default=multiprocessing.cpu_count(), significant=False) @@ -156,7 +156,8 @@ class Refcat(BaseTask):  class Refs(luigi.ExternalTask, Refcat):      """ -    Compressed (zstd) references, as of 01/2021 containing ~1.8B docs. +    Compressed (zstd) references, as of 01/2021 containing ~1.8B docs; this +    might increase in a next version.      """      def output(self):          return luigi.LocalTarget(path=settings.REFS_FILE, format=Zstd) @@ -164,7 +165,7 @@ class Refs(luigi.ExternalTask, Refcat):  class ReleaseExportExpanded(luigi.ExternalTask, Refcat):      """ -    Release export, zstd version. +    Fatcat release export, zstd version, from e.g. https://archive.org/details/fatcat_snapshots_and_exports      """      def output(self):          return luigi.LocalTarget(path=settings.RELEASE_EXPORT_EXPANDED_FILE, format=Zstd) @@ -180,9 +181,9 @@ class MAGPapers(luigi.ExternalTask, Refcat):  class WikipediaCitationsMinimalDataset(luigi.ExternalTask, Refcat):      """ -    From archive.org/details/wikipedia_citations_2020-07-14 (Wikipedia +    From https://archive.org/details/wikipedia_citations_2020-07-14 (Wikipedia      Citations: A comprehensive dataset of citations with identifiers extracted -    from English Wikipedia). +    from English Wikipedia); http://doi.org/10.5281/zenodo.3940692.      
Dataset contains parquet, but we want JSON here: @@ -191,17 +192,22 @@ class WikipediaCitationsMinimalDataset(luigi.ExternalTask, Refcat):      def output(self):          return luigi.LocalTarget(path=os.path.join(settings.WIKIPEDIA_CITATIONS, "minimal_dataset.json")) -class OpenLibraryDump(luigi.ExternalTask, Refcat): +class OpenLibraryDump(luigi.ExternalTask, Refcat): +    """ +    A solrdump exported version from a SOLR from: +    https://archive.org/details/olsolr8-2021-04-12; about 30M items. +    """      def output(self):          return luigi.LocalTarget(path=settings.OL_DUMP, format=Zstd) +  # ----8< Derivations  class RefsWithUnstructured(Refcat):      """ -    Augment refs with data from unstructured. Do this first, so we can use it +    Augment refs with data from biblio.unstructured. Do this first, so we can use it      all subsequent steps.      """      def requires(self): @@ -211,7 +217,7 @@ class RefsWithUnstructured(Refcat):          output = shellout("""                            zstdcat -T0 {input} |                            skate-from-unstructured | -                          zstd -T0 -c9 > {output} +                          zstd -T0 -c > {output}                            """,                            input=self.input().path)          luigi.LocalTarget(output).move(self.output().path) @@ -222,7 +228,7 @@ class RefsWithUnstructured(Refcat):  class ReleaseExportReduced(Refcat):      """ -    Reduce dataset size, stripping fields. +    Reduce dataset size, stripping some heave fields.      """      def requires(self):          return ReleaseExportExpanded() @@ -240,25 +246,6 @@ class ReleaseExportReduced(Refcat):          return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd) -class ReleaseExportTitleOnly(Refcat): -    """ -    Reduce dataset size, only keep title. 
-    """ -    def requires(self): -        return ReleaseExportReduced() - -    def run(self): -        output = shellout(""" -                          zstdcat -T0 {input} | -                          parallel --block 10M -j 16 --pipe "jq -rc '{{\"title\": .title}}'" | -                          zstd -T0 -c9 > {output} -                          """, -                          input=self.input().path) -        luigi.LocalTarget(output).move(self.output().path) - -    def output(self): -        return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd) -  class URLTabs(Refcat):      """ @@ -1431,8 +1418,10 @@ class RefsSortedIdent(Refcat):      def output(self):          return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd) +  # OL +  class WithISBN(Refcat):      """      Keeps converted refs with isbn. @@ -1475,5 +1464,3 @@ class OpenLibraryWorks(Refcat):      def output(self):          return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd) - - | 
