diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2022-01-14 14:53:50 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2022-01-14 14:53:50 +0100 |
commit | 7742d516be3480c1736ab030f77d3bf803e0f22f (patch) | |
tree | d36f619148a93f27ea28768543a02a6220bf451a | |
parent | 4539d4bdd5fb280cf8923148fed2a648b41ee574 (diff) | |
download | refcat-7742d516be3480c1736ab030f77d3bf803e0f22f.tar.gz refcat-7742d516be3480c1736ab030f77d3bf803e0f22f.zip |
tasks: update numbers
-rw-r--r-- | python/refcat/tasks.py | 7 |
1 files changed, 4 insertions, 3 deletions
```diff
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index 1795cbd..81b7b50 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -225,9 +225,10 @@ class Refs(luigi.ExternalTask, Refcat):
     """
     Compressed (zstd) references, as of 01/2021 containing ~1.8B docs; this
     might increase in a next version. This comes from a custom derivation from
-    an "heavy intermediate" format in a scholar pipeline.
+    an "heavy intermediate" format from the scholar pipeline.
 
-    As of 07/2021, we have 2,507,793,772 raw refs.
+    * as of 07/2021, we have 2,507,793,772 raw refs
+    * as of 01/2022, we have 2,781,453,198 raw refs
     """
     def output(self):
         return luigi.LocalTarget(path=settings.REFS_FILE, format=Zstd)
@@ -252,7 +253,7 @@ class WikipediaCitationsMinimalDataset(luigi.ExternalTask, Refcat):
 
         $ parquet-tools cat --json minimal_dataset.parquet > minimal_dataset.json
 
-    Contains (07/2021) around 29276667 rows.
+    Contains (07/2021) around 29,276,667 rows.
 
     Rough id type distribution:
 
```