about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2022-01-14 14:53:50 +0100
committer Martin Czygan <martin.czygan@gmail.com> 2022-01-14 14:53:50 +0100
commit 7742d516be3480c1736ab030f77d3bf803e0f22f (patch)
tree d36f619148a93f27ea28768543a02a6220bf451a
parent 4539d4bdd5fb280cf8923148fed2a648b41ee574 (diff)
download refcat-7742d516be3480c1736ab030f77d3bf803e0f22f.tar.gz
refcat-7742d516be3480c1736ab030f77d3bf803e0f22f.zip
tasks: update numbers
-rw-r--r-- python/refcat/tasks.py 7
1 files changed, 4 insertions, 3 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index 1795cbd..81b7b50 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -225,9 +225,10 @@ class Refs(luigi.ExternalTask, Refcat):
"""
Compressed (zstd) references, as of 01/2021 containing ~1.8B docs; this
might increase in a next version. This comes from a custom derivation from
- an "heavy intermediate" format in a scholar pipeline.
+ an "heavy intermediate" format from the scholar pipeline.
- As of 07/2021, we have 2,507,793,772 raw refs.
+ * as of 07/2021, we have 2,507,793,772 raw refs
+ * as of 01/2022, we have 2,781,453,198 raw refs
"""
def output(self):
return luigi.LocalTarget(path=settings.REFS_FILE, format=Zstd)
@@ -252,7 +253,7 @@ class WikipediaCitationsMinimalDataset(luigi.ExternalTask, Refcat):
$ parquet-tools cat --json minimal_dataset.parquet > minimal_dataset.json
- Contains (07/2021) around 29276667 rows.
+ Contains (07/2021) around 29,276,667 rows.
Rough id type distribution: