diff options
Diffstat (limited to 'sql')
-rw-r--r-- | sql/migrations/2019-12-19-060141_init/up.sql | 1 | ||||
-rw-r--r-- | sql/monitoring_queries.md | 94 | ||||
-rw-r--r-- | sql/random_queries.md | 20 |
3 files changed, 112 insertions, 3 deletions
diff --git a/sql/migrations/2019-12-19-060141_init/up.sql b/sql/migrations/2019-12-19-060141_init/up.sql index a27796b..688487f 100644 --- a/sql/migrations/2019-12-19-060141_init/up.sql +++ b/sql/migrations/2019-12-19-060141_init/up.sql @@ -37,6 +37,7 @@ CREATE TABLE IF NOT EXISTS file_meta ( size_bytes BIGINT, mimetype TEXT CHECK (octet_length(mimetype) >= 1) ); +CREATE INDEX file_meta_md5hex_idx ON file_meta(md5hex); CREATE TABLE IF NOT EXISTS fatcat_file ( sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), diff --git a/sql/monitoring_queries.md b/sql/monitoring_queries.md new file mode 100644 index 0000000..35eef8f --- /dev/null +++ b/sql/monitoring_queries.md @@ -0,0 +1,94 @@ + +## fatcat-changelog pipeline + +Overall ingest status, past 3 days: + + SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.created >= NOW() - '3 day'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-changelog' + GROUP BY ingest_file_result.ingest_type, ingest_file_result.status + ORDER BY COUNT DESC + LIMIT 20; + +Broken domains, past 3 days: + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.created >= NOW() - '3 day'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-changelog' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 25; + +Throughput per day, and success, for past month: + + SELECT ingest_request.ingest_type, + date(ingest_request.created), + COUNT(*) as total, + COUNT(CASE ingest_file_result.status WHEN 'success' THEN 1 ELSE null END) as success + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.created >= NOW() - '1 month'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-changelog' + GROUP BY ingest_request.ingest_type, ingest_file_result.ingest_type, date(ingest_file_result.updated) + ORDER BY date(ingest_file_result.updated) DESC; + +## fatcat-ingest + +Broken domains, past 7 days: + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.created >= NOW() - '7 day'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-ingest' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 25; + +Throughput per day, and success, for past 7 days: + + SELECT ingest_request.ingest_type, + date(ingest_file_result.updated), + COUNT(*) as total, + COUNT(CASE ingest_file_result.status WHEN 'success' THEN 1 ELSE null END) as success + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.created >= NOW() - '7 day'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-ingest' + GROUP BY ingest_request.ingest_type, ingest_file_result.ingest_type, date(ingest_file_result.updated) + ORDER BY date(ingest_file_result.updated) DESC; diff --git a/sql/random_queries.md b/sql/random_queries.md index d88f45b..572b4f9 100644 --- a/sql/random_queries.md +++ b/sql/random_queries.md @@ -117,9 +117,23 @@ Or: Can also do some quick lookups for a specific domain and protocol like: SELECT * - FROM ingest_file_result - WHERE terminal_url LIKE 'https://insights.ovid.com/%' - LIMIT 10; + FROM ingest_file_result + WHERE terminal_url LIKE 'https://insights.ovid.com/%' + LIMIT 10; + +For a given DOI prefix: + + SELECT * + FROM ingest_file_result + WHERE base_url LIKE 'https://doi.org/10.17223/a%' + AND status = 'no-pdf-link' + LIMIT 10; + + SELECT status, count(*) + FROM ingest_file_result + WHERE base_url LIKE 'https://doi.org/10.17223/%' + GROUP BY status + ORDER BY count(*) DESC; ## Bulk Ingest |