From 5f9e7fd4c89df98ed90be9629d3dc6c201b42a02 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 3 Feb 2020 21:50:51 -0800 Subject: more random sandcrawler-db queries --- sql/README.md | 32 -------------------------------- 1 file changed, 32 deletions(-) (limited to 'sql/README.md') diff --git a/sql/README.md b/sql/README.md index d3e838c..42dba31 100644 --- a/sql/README.md +++ b/sql/README.md @@ -139,38 +139,6 @@ Questions we might want to answer http get :3030/cdx?url=eq.https://coleccionables.mercadolibre.com.ar/arduino-pdf_Installments_NoInterest_BestSellers_YES http get :3030/file_meta?sha1hex=eq.120582c855a7cc3c70a8527c560d7f27e6027278 -## Parse URLs - -One approach is to do regexes, something like: - - SELECT substring(column_name FROM '[^/]+://([^/]+)/') AS domain_name FROM table_name; - -Eg: - - SELECT DISTINCT(domain), COUNT(domain) - FROM (select substring(base_url FROM '[^/]+://([^/]*)') as domain FROM ingest_file_result) t1 - WHERE t1.domain != '' - GROUP BY domain - ORDER BY COUNT DESC - LIMIT 10; - -Or: - - SELECT domain, status, COUNT((domain, status)) - FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 - WHERE t1.domain != '' - AND t1.status != 'success' - GROUP BY domain, status - ORDER BY COUNT DESC - LIMIT 10; - -Can also do some quick lookups for a specific domain and protocol like: - - SELECT * - FROM ingest_file_result - WHERE terminal_url LIKE 'https://insights.ovid.com/%' - LIMIT 10; - ## Full Database Dumps Run a dump in compressed, postgres custom format: -- cgit v1.2.3