From 1b74e8a4dee21bd260040dad8072e4fb48456b3c Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Mon, 24 Feb 2020 21:01:25 -0800
Subject: recent sandcrawler-db / ingest stats (interesting)

---
 sql/stats/README.md | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'sql/stats/README.md')

diff --git a/sql/stats/README.md b/sql/stats/README.md
index ea61fa0..52642f6 100644
--- a/sql/stats/README.md
+++ b/sql/stats/README.md
@@ -41,6 +41,10 @@ Total and unique-by-sha1 counts:
 
     SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx;
 
+mimetype counts:
+
+    SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC;
+
 Processed or not:
 
     # TODO:
@@ -71,6 +75,8 @@ Requests by source:
 
     SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
 
+    SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25;
+
 Uncrawled requests by source:
 
     # TODO: verify this?
-- 
cgit v1.2.3