From 7742d516be3480c1736ab030f77d3bf803e0f22f Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 14 Jan 2022 14:53:50 +0100 Subject: tasks: update numbers --- python/refcat/tasks.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py index 1795cbd..81b7b50 100644 --- a/python/refcat/tasks.py +++ b/python/refcat/tasks.py @@ -225,9 +225,10 @@ class Refs(luigi.ExternalTask, Refcat): """ Compressed (zstd) references, as of 01/2021 containing ~1.8B docs; this might increase in a next version. This comes from a custom derivation from - an "heavy intermediate" format in a scholar pipeline. + a "heavy intermediate" format from the scholar pipeline. - As of 07/2021, we have 2,507,793,772 raw refs. + * as of 07/2021, we have 2,507,793,772 raw refs + * as of 01/2022, we have 2,781,453,198 raw refs """ def output(self): return luigi.LocalTarget(path=settings.REFS_FILE, format=Zstd) @@ -252,7 +253,7 @@ class WikipediaCitationsMinimalDataset(luigi.ExternalTask, Refcat): $ parquet-tools cat --json minimal_dataset.parquet > minimal_dataset.json - Contains (07/2021) around 29276667 rows. + Contains (07/2021) around 29,276,667 rows. Rough id type distribution: -- cgit v1.2.3