aboutsummaryrefslogtreecommitdiffstats
path: root/sql/stats/README.md
diff options
context:
space:
mode:
Diffstat (limited to 'sql/stats/README.md')
-rw-r--r-- sql/stats/README.md 12
1 files changed, 6 insertions, 6 deletions
diff --git a/sql/stats/README.md b/sql/stats/README.md
index 89deec2..2e9eae5 100644
--- a/sql/stats/README.md
+++ b/sql/stats/README.md
@@ -29,7 +29,7 @@ Counts and total file size:
Top mimetypes:
- SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 10;
+ SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 20;
Missing full metadata:
@@ -43,11 +43,7 @@ Total and unique-by-sha1 counts:
mimetype counts:
- SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC;
-
-Processed or not:
-
- # TODO:
+ SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 25;
## GROBID
@@ -107,6 +103,10 @@ Ingest result by status:
SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 25;
+Failed ingest by terminal status code:
+
+ SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 25;
+
## Fatcat Files
Count of PDF files that GROBID processed and matched to a release (via