aboutsummaryrefslogtreecommitdiffstats
path: root/sql/stats/README.md
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-02-24 21:01:25 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-02-24 21:01:25 -0800
commit: 1b74e8a4dee21bd260040dad8072e4fb48456b3c (patch)
tree: 36a5c8f263ff7a1637754816c0d76a226e77532d /sql/stats/README.md
parent: be8f1d134681caaa15485246b65551a67e5bd5a5 (diff)
download: sandcrawler-1b74e8a4dee21bd260040dad8072e4fb48456b3c.tar.gz
download: sandcrawler-1b74e8a4dee21bd260040dad8072e4fb48456b3c.zip
recent sandcrawler-db / ingest stats (interesting)
Diffstat (limited to 'sql/stats/README.md')
-rw-r--r-- sql/stats/README.md | 6
1 files changed, 6 insertions, 0 deletions
diff --git a/sql/stats/README.md b/sql/stats/README.md
index ea61fa0..52642f6 100644
--- a/sql/stats/README.md
+++ b/sql/stats/README.md
@@ -41,6 +41,10 @@ Total and unique-by-sha1 counts:
SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx;
+mimetype counts:
+
+ SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC;
+
Processed or not:
# TODO:
@@ -71,6 +75,8 @@ Requests by source:
SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
+ SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25;
+
Uncrawled requests by source:
# TODO: verify this?