aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-05-06 20:13:20 +0200
committerMartin Czygan <martin.czygan@gmail.com>2021-05-06 20:13:20 +0200
commit6c327acbf5799dde9c153843ac3ba1471e88317c (patch)
treefaf5144f4ed9ce1ec8115783f17cd4c8e7816fb7 /python
parentb46c9d351aa0a2a5c3618a1420259d4605a9654e (diff)
downloadrefcat-6c327acbf5799dde9c153843ac3ba1471e88317c.tar.gz
refcat-6c327acbf5799dde9c153843ac3ba1471e88317c.zip
start to cleanup tasks
Diffstat (limited to 'python')
-rw-r--r--python/refcat/tasks.py47
1 files changed, 17 insertions, 30 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index 182a51f..4c6723c 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -139,7 +139,7 @@ class Refcat(BaseTask):
BASE = settings.BASE
TAG = 'refcat'
- date = luigi.DateParameter(default=datetime.date(2021, 2, 20), description="a versioning help, change this manually")
+ date = luigi.DateParameter(default=datetime.date(2021, 5, 6), description="a versioning help, change this manually")
tmpdir = luigi.Parameter(default="/magna/tmp", description="set tempdir", significant=False)
n = luigi.IntParameter(default=multiprocessing.cpu_count(), significant=False)
@@ -156,7 +156,8 @@ class Refcat(BaseTask):
class Refs(luigi.ExternalTask, Refcat):
"""
- Compressed (zstd) references, as of 01/2021 containing ~1.8B docs.
+ Compressed (zstd) references, as of 01/2021 containing ~1.8B docs; this
+ might increase in a future version.
"""
def output(self):
return luigi.LocalTarget(path=settings.REFS_FILE, format=Zstd)
@@ -164,7 +165,7 @@ class Refs(luigi.ExternalTask, Refcat):
class ReleaseExportExpanded(luigi.ExternalTask, Refcat):
"""
- Release export, zstd version.
+ Fatcat release export, zstd version, from e.g. https://archive.org/details/fatcat_snapshots_and_exports
"""
def output(self):
return luigi.LocalTarget(path=settings.RELEASE_EXPORT_EXPANDED_FILE, format=Zstd)
@@ -180,9 +181,9 @@ class MAGPapers(luigi.ExternalTask, Refcat):
class WikipediaCitationsMinimalDataset(luigi.ExternalTask, Refcat):
"""
- From archive.org/details/wikipedia_citations_2020-07-14 (Wikipedia
+ From https://archive.org/details/wikipedia_citations_2020-07-14 (Wikipedia
Citations: A comprehensive dataset of citations with identifiers extracted
- from English Wikipedia).
+ from English Wikipedia); http://doi.org/10.5281/zenodo.3940692.
Dataset contains parquet, but we want JSON here:
@@ -191,17 +192,22 @@ class WikipediaCitationsMinimalDataset(luigi.ExternalTask, Refcat):
def output(self):
return luigi.LocalTarget(path=os.path.join(settings.WIKIPEDIA_CITATIONS, "minimal_dataset.json"))
-class OpenLibraryDump(luigi.ExternalTask, Refcat):
+class OpenLibraryDump(luigi.ExternalTask, Refcat):
+ """
+ A solrdump export from a Solr instance, taken from:
+ https://archive.org/details/olsolr8-2021-04-12; about 30M items.
+ """
def output(self):
return luigi.LocalTarget(path=settings.OL_DUMP, format=Zstd)
+
# ----8< Derivations
class RefsWithUnstructured(Refcat):
"""
- Augment refs with data from unstructured. Do this first, so we can use it
+ Augment refs with data from biblio.unstructured. Do this first, so we can use it
in all subsequent steps.
"""
def requires(self):
@@ -211,7 +217,7 @@ class RefsWithUnstructured(Refcat):
output = shellout("""
zstdcat -T0 {input} |
skate-from-unstructured |
- zstd -T0 -c9 > {output}
+ zstd -T0 -c > {output}
""",
input=self.input().path)
luigi.LocalTarget(output).move(self.output().path)
@@ -222,7 +228,7 @@ class RefsWithUnstructured(Refcat):
class ReleaseExportReduced(Refcat):
"""
- Reduce dataset size, stripping fields.
+ Reduce dataset size, stripping some heavy fields.
"""
def requires(self):
return ReleaseExportExpanded()
@@ -240,25 +246,6 @@ class ReleaseExportReduced(Refcat):
return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
-class ReleaseExportTitleOnly(Refcat):
- """
- Reduce dataset size, only keep title.
- """
- def requires(self):
- return ReleaseExportReduced()
-
- def run(self):
- output = shellout("""
- zstdcat -T0 {input} |
- parallel --block 10M -j 16 --pipe "jq -rc '{{\"title\": .title}}'" |
- zstd -T0 -c9 > {output}
- """,
- input=self.input().path)
- luigi.LocalTarget(output).move(self.output().path)
-
- def output(self):
- return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
-
class URLTabs(Refcat):
"""
@@ -1431,8 +1418,10 @@ class RefsSortedIdent(Refcat):
def output(self):
return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
+
# OL
+
class WithISBN(Refcat):
"""
Keeps converted refs with isbn.
@@ -1475,5 +1464,3 @@ class OpenLibraryWorks(Refcat):
def output(self):
return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
-
-