diff options
Diffstat (limited to 'sql/random_queries.md')
-rw-r--r-- | sql/random_queries.md | 193 |
1 files changed, 193 insertions, 0 deletions
diff --git a/sql/random_queries.md b/sql/random_queries.md new file mode 100644 index 0000000..572b4f9 --- /dev/null +++ b/sql/random_queries.md @@ -0,0 +1,193 @@ + +Basic stats (2019-09-23): + + SELECT COUNT(*) FROM cdx WHERE NOT EXISTS (SELECT grobid.sha1hex FROM grobid WHERE cdx.sha1hex = grobid.sha1hex); + => 28,023,760 + => Time: 253897.213 ms (04:13.897) + + SELECT COUNT(DISTINCT sha1hex) FROM cdx WHERE NOT EXISTS (SELECT grobid.sha1hex FROM grobid WHERE cdx.sha1hex = grobid.sha1hex); + => 22,816,087 + => Time: 287097.944 ms (04:47.098) + + SELECT COUNT(*) FROM grobid. + => 56,196,992 + + SELECT COUNT(DISTINCT sha1hex) FROM cdx; + => 64,348,277 + => Time: 572383.931 ms (09:32.384) + + SELECT COUNT(*) FROM cdx; + => 74,796,777 + + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC; + => Time: 189067.335 ms (03:09.067) + + mimetype | count + ------------------------+---------- + application/pdf | 51049905 + text/html | 24841846 + text/xml | 524682 + application/postscript | 81009 + (4 rows) + +Time: 189067.335 ms (03:09.067) + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY count(*) DESC; + + status_code | count + -------------+---------- + 200 | 56196992 + + compare with older sandcrawler/output-prod/2019-05-28-1920.35-statuscodecount: + + 200 49567139 + 400 3464503 + 409 691917 + 500 247028 + 503 123 + + SELECT row_to_json(cdx) FROM cdx LIMIT 5; + + SELECT row_to_json(r) FROM ( + SELECT url, datetime FROM cdx + ) r + LIMIT 5; + +More stats (2019-12-27): + + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 20; + + SELECT SUM(size_bytes) FROM file_meta; + +"Last 24 hour progress": + + # "problem domains" and statuses + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + AND t1.updated >= NOW() - '1 day'::INTERVAL + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 10; + + # "what type of errors" + SELECT ingest_type, status, COUNT(*) + FROM ingest_file_result + WHERE updated >= NOW() - '1 day'::INTERVAL + GROUP BY ingest_type, status + ORDER BY COUNT DESC + LIMIT 25; + + # "throughput per day for last N days" + SELECT ingest_type, + date(updated), + COUNT(*) as total, + COUNT(CASE status WHEN 'success' THEN 1 ELSE null END) as success + FROM ingest_file_result + WHERE updated >= NOW() - '1 month'::INTERVAL + GROUP BY ingest_type, date(updated) + ORDER BY date(updated) DESC; + +## Parse URLs + +One approach is to do regexes, something like: + + SELECT substring(column_name FROM '[^/]+://([^/]+)/') AS domain_name FROM table_name; + +Eg: + + SELECT DISTINCT(domain), COUNT(domain) + FROM (select substring(base_url FROM '[^/]+://([^/]*)') as domain FROM ingest_file_result) t1 + WHERE t1.domain != '' + GROUP BY domain + ORDER BY COUNT DESC + LIMIT 10; + +Or: + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 10; + +Can also do some quick lookups for a specific domain and protocol like: + + SELECT * + FROM ingest_file_result + WHERE terminal_url LIKE 'https://insights.ovid.com/%' + LIMIT 10; + +For a given DOI prefix: + + SELECT * + FROM ingest_file_result + WHERE base_url LIKE 'https://doi.org/10.17223/a%' + AND status = 'no-pdf-link' + LIMIT 10; + + SELECT status, count(*) + FROM ingest_file_result + WHERE base_url LIKE 'https://doi.org/10.17223/%' + GROUP BY status + ORDER BY count(*) DESC; + +## Bulk Ingest + +Show bulk ingest status on links *added* in the past week: + + SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.created >= NOW() - '30 day'::INTERVAL + AND ingest_request.link_source = 'unpaywall' + GROUP BY ingest_file_result.ingest_type, ingest_file_result.status + ORDER BY COUNT DESC + LIMIT 25; + +Top *successful* domains: + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.created >= NOW() - '7 day'::INTERVAL + AND ingest_request.link_source = 'unpaywall' + ) t1 + WHERE t1.domain != '' + AND t1.status = 'success' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 20; + +Summarize non-success domains for the same: + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.created >= NOW() - '7 day'::INTERVAL + AND ingest_request.link_source = 'unpaywall' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 20; |