diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-02-03 21:50:51 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-02-03 21:50:51 -0800 |
commit | 5f9e7fd4c89df98ed90be9629d3dc6c201b42a02 (patch) | |
tree | 01304b674caa1aaff8dba945f35ceb946c1a024d /sql | |
parent | e865b395f945e5d97a99ce269f23957cae01feaa (diff) | |
download | sandcrawler-5f9e7fd4c89df98ed90be9629d3dc6c201b42a02.tar.gz sandcrawler-5f9e7fd4c89df98ed90be9629d3dc6c201b42a02.zip |
more random sandcrawler-db queries
Diffstat (limited to 'sql')
-rw-r--r-- | sql/README.md | 32 | ||||
-rw-r--r-- | sql/random_queries.md | 62 |
2 files changed, 62 insertions, 32 deletions
diff --git a/sql/README.md b/sql/README.md index d3e838c..42dba31 100644 --- a/sql/README.md +++ b/sql/README.md @@ -139,38 +139,6 @@ Questions we might want to answer http get :3030/cdx?url=eq.https://coleccionables.mercadolibre.com.ar/arduino-pdf_Installments_NoInterest_BestSellers_YES http get :3030/file_meta?sha1hex=eq.120582c855a7cc3c70a8527c560d7f27e6027278 -## Parse URLs - -One approach is to do regexes, something like: - - SELECT substring(column_name FROM '[^/]+://([^/]+)/') AS domain_name FROM table_name; - -Eg: - - SELECT DISTINCT(domain), COUNT(domain) - FROM (select substring(base_url FROM '[^/]+://([^/]*)') as domain FROM ingest_file_result) t1 - WHERE t1.domain != '' - GROUP BY domain - ORDER BY COUNT DESC - LIMIT 10; - -Or: - - SELECT domain, status, COUNT((domain, status)) - FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 - WHERE t1.domain != '' - AND t1.status != 'success' - GROUP BY domain, status - ORDER BY COUNT DESC - LIMIT 10; - -Can also do some quick lookups for a specific domain and protocol like: - - SELECT * - FROM ingest_file_result - WHERE terminal_url LIKE 'https://insights.ovid.com/%' - LIMIT 10; - ## Full Database Dumps Run a dump in compressed, postgres custom format: diff --git a/sql/random_queries.md b/sql/random_queries.md index 604088a..98b32f9 100644 --- a/sql/random_queries.md +++ b/sql/random_queries.md @@ -58,3 +58,65 @@ More stats (2019-12-27): SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 20; SELECT SUM(size_bytes) FROM file_meta; + +"Last 24 hour progress": + + # "problem domains" and statuses + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + AND t1.updated >= NOW() - '1 day'::INTERVAL + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 10; + + # "what type of errors" + SELECT ingest_type, status, COUNT(*) + FROM ingest_file_result + WHERE updated >= NOW() - '1 day'::INTERVAL + GROUP BY ingest_type, status + ORDER BY COUNT DESC + LIMIT 25; + + # "throughput per day for last N days" + SELECT ingest_type, + date(updated), + COUNT(*) as total, + COUNT(CASE status WHEN 'success' THEN 1 ELSE null END) as success + FROM ingest_file_result + WHERE updated >= NOW() - '1 month'::INTERVAL + GROUP BY ingest_type, date(updated) + ORDER BY date(updated) DESC; + +## Parse URLs + +One approach is to do regexes, something like: + + SELECT substring(column_name FROM '[^/]+://([^/]+)/') AS domain_name FROM table_name; + +Eg: + + SELECT DISTINCT(domain), COUNT(domain) + FROM (select substring(base_url FROM '[^/]+://([^/]*)') as domain FROM ingest_file_result) t1 + WHERE t1.domain != '' + GROUP BY domain + ORDER BY COUNT DESC + LIMIT 10; + +Or: + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 10; + +Can also do some quick lookups for a specific domain and protocol like: + + SELECT * + FROM ingest_file_result + WHERE terminal_url LIKE 'https://insights.ovid.com/%' + LIMIT 10; |