about summary refs log tree commit diff stats
path: root/python
diff options
context:
space:
mode:
Diffstat (limited to 'python')
-rw-r--r-- python/refcat/tasks.py 7
1 files changed, 4 insertions, 3 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index 1795cbd..81b7b50 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -225,9 +225,10 @@ class Refs(luigi.ExternalTask, Refcat):
"""
Compressed (zstd) references, as of 01/2021 containing ~1.8B docs; this
might increase in a next version. This comes from a custom derivation from
- an "heavy intermediate" format in a scholar pipeline.
+ an "heavy intermediate" format from the scholar pipeline.
- As of 07/2021, we have 2,507,793,772 raw refs.
+ * as of 07/2021, we have 2,507,793,772 raw refs
+ * as of 01/2022, we have 2,781,453,198 raw refs
"""
def output(self):
return luigi.LocalTarget(path=settings.REFS_FILE, format=Zstd)
@@ -252,7 +253,7 @@ class WikipediaCitationsMinimalDataset(luigi.ExternalTask, Refcat):
$ parquet-tools cat --json minimal_dataset.parquet > minimal_dataset.json
- Contains (07/2021) around 29276667 rows.
+ Contains (07/2021) around 29,276,667 rows.
Rough id type distribution: