diff options
-rw-r--r-- | python/refcat/tasks.py | 7 |
1 files changed, 4 insertions, 3 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py index 1795cbd..81b7b50 100644 --- a/python/refcat/tasks.py +++ b/python/refcat/tasks.py @@ -225,9 +225,10 @@ class Refs(luigi.ExternalTask, Refcat): """ Compressed (zstd) references, as of 01/2021 containing ~1.8B docs; this might increase in a next version. This comes from a custom derivation from - an "heavy intermediate" format in a scholar pipeline. + an "heavy intermediate" format from the scholar pipeline. - As of 07/2021, we have 2,507,793,772 raw refs. + * as of 07/2021, we have 2,507,793,772 raw refs + * as of 01/2022, we have 2,781,453,198 raw refs """ def output(self): return luigi.LocalTarget(path=settings.REFS_FILE, format=Zstd) @@ -252,7 +253,7 @@ class WikipediaCitationsMinimalDataset(luigi.ExternalTask, Refcat): $ parquet-tools cat --json minimal_dataset.parquet > minimal_dataset.json - Contains (07/2021) around 29276667 rows. + Contains (07/2021) around 29,276,667 rows. Rough id type distribution: |