aboutsummaryrefslogtreecommitdiffstats
path: root/sql/stats/README.md
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-09-15 00:20:57 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-09-15 00:20:57 -0700
commit c1f60b1e31c8ff3b25c7cdcd442ccd91512d1db8 (patch)
tree a5c89b810c74e65a67c66b412daffbb555792016 /sql/stats/README.md
parent c714ecdcd8aa8bb39b1b46860944b6cace7f5077 (diff)
download sandcrawler-c1f60b1e31c8ff3b25c7cdcd442ccd91512d1db8.tar.gz
sandcrawler-c1f60b1e31c8ff3b25c7cdcd442ccd91512d1db8.zip
updated sandcrawler-db stats
Diffstat (limited to 'sql/stats/README.md')
-rw-r--r-- sql/stats/README.md 12
1 files changed, 6 insertions, 6 deletions
diff --git a/sql/stats/README.md b/sql/stats/README.md
index 89deec2..2e9eae5 100644
--- a/sql/stats/README.md
+++ b/sql/stats/README.md
@@ -29,7 +29,7 @@ Counts and total file size:
Top mimetypes:
- SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 10;
+ SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 20;
Missing full metadata:
@@ -43,11 +43,7 @@ Total and unique-by-sha1 counts:
mimetype counts:
- SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC;
-
-Processed or not:
-
- # TODO:
+ SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 25;
## GROBID
@@ -107,6 +103,10 @@ Ingest result by status:
SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 25;
+Failed ingest by terminal status code:
+
+ SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 25;
+
## Fatcat Files
Count of PDF files that GROBID processed and matched to a release (via