diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2020-03-10 22:38:19 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2020-03-10 22:38:19 -0700 | 
| commit | 19e094f820e7c619b9180616daf1586c4daa66bd (patch) | |
| tree | 7442ffe4f7c37b32f2a049d3d0ba9dab14f39c82 /sql | |
| parent | a53699e21f434adb06263763c0743ca4413eef91 (diff) | |
| download | sandcrawler-19e094f820e7c619b9180616daf1586c4daa66bd.tar.gz sandcrawler-19e094f820e7c619b9180616daf1586c4daa66bd.zip  | |
helpful daily/weekly monitoring SQL queries
Diffstat (limited to 'sql')
| -rw-r--r-- | sql/monitoring_queries.md | 94 | 
1 files changed, 94 insertions, 0 deletions
diff --git a/sql/monitoring_queries.md b/sql/monitoring_queries.md new file mode 100644 index 0000000..b46e6ec --- /dev/null +++ b/sql/monitoring_queries.md @@ -0,0 +1,94 @@ + +## fatcat-changelog pipeline + +Overall ingest status, past 3 days: + +    SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*) +    FROM ingest_file_result +    LEFT JOIN ingest_request +        ON ingest_file_result.ingest_type = ingest_request.ingest_type +        AND ingest_file_result.base_url = ingest_request.base_url +    WHERE ingest_file_result.updated >= NOW() - '3 day'::INTERVAL +        AND ingest_request.ingest_type = 'pdf' +        AND ingest_request.ingest_request_source = 'fatcat-changelog' +    GROUP BY ingest_file_result.ingest_type, ingest_file_result.status +    ORDER BY COUNT DESC +    LIMIT 20; + +Broken domains, past 3 days: + +    SELECT domain, status, COUNT((domain, status)) +    FROM ( +        SELECT +            ingest_file_result.ingest_type, +            ingest_file_result.status, +            substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain +        FROM ingest_file_result +        LEFT JOIN ingest_request +            ON ingest_file_result.ingest_type = ingest_request.ingest_type +            AND ingest_file_result.base_url = ingest_request.base_url +        WHERE ingest_request.created >= NOW() - '3 day'::INTERVAL +            AND ingest_request.ingest_type = 'pdf' +            AND ingest_request.ingest_request_source = 'fatcat-changelog' +    ) t1 +    WHERE t1.domain != '' +        AND t1.status != 'success' +    GROUP BY domain, status +    ORDER BY COUNT DESC +    LIMIT 25; + +Throughput per day, and success, for past month: + +    SELECT ingest_request.ingest_type, +           date(ingest_file_result.updated), +           COUNT(*) as total, +           COUNT(CASE ingest_file_result.status WHEN 'success' THEN 1 ELSE null END) as success +    FROM ingest_file_result +    LEFT JOIN ingest_request +        ON ingest_file_result.ingest_type = ingest_request.ingest_type +        AND ingest_file_result.base_url = ingest_request.base_url +    WHERE ingest_request.created >= NOW() - '1 month'::INTERVAL +        AND ingest_request.ingest_type = 'pdf' +        AND ingest_request.ingest_request_source = 'fatcat-changelog' +    GROUP BY ingest_request.ingest_type, ingest_file_result.ingest_type, date(ingest_file_result.updated) +    ORDER BY date(ingest_file_result.updated) DESC; + +## fatcat-ingest + +Broken domains, past 7 days: + +    SELECT domain, status, COUNT((domain, status)) +    FROM ( +        SELECT +            ingest_file_result.ingest_type, +            ingest_file_result.status, +            substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain +        FROM ingest_file_result +        LEFT JOIN ingest_request +            ON ingest_file_result.ingest_type = ingest_request.ingest_type +            AND ingest_file_result.base_url = ingest_request.base_url +        WHERE ingest_request.created >= NOW() - '7 day'::INTERVAL +            AND ingest_request.ingest_type = 'pdf' +            AND ingest_request.ingest_request_source = 'fatcat-ingest' +    ) t1 +    WHERE t1.domain != '' +        AND t1.status != 'success' +    GROUP BY domain, status +    ORDER BY COUNT DESC +    LIMIT 25; + +Throughput per day, and success, for past 7 days: + +    SELECT ingest_request.ingest_type, +           date(ingest_file_result.updated), +           COUNT(*) as total, +           COUNT(CASE ingest_file_result.status WHEN 'success' THEN 1 ELSE null END) as success +    FROM ingest_file_result +    LEFT JOIN ingest_request +        ON ingest_file_result.ingest_type = ingest_request.ingest_type +        AND ingest_file_result.base_url = ingest_request.base_url +    WHERE ingest_request.created >= NOW() - '7 day'::INTERVAL +        AND ingest_request.ingest_type = 'pdf' +        AND ingest_request.ingest_request_source = 'fatcat-ingest' +    GROUP BY ingest_request.ingest_type, ingest_file_result.ingest_type, date(ingest_file_result.updated) +    ORDER BY date(ingest_file_result.updated) DESC;  | 
