1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
|
## fatcat-changelog pipeline
Overall ingest status, past 30 days:
SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*)
FROM ingest_file_result
LEFT JOIN ingest_request
ON ingest_file_result.ingest_type = ingest_request.ingest_type
AND ingest_file_result.base_url = ingest_request.base_url
WHERE ingest_request.created >= NOW() - '30 day'::INTERVAL
AND ingest_request.ingest_type = 'pdf'
AND ingest_request.ingest_request_source = 'fatcat-changelog'
GROUP BY ingest_file_result.ingest_type, ingest_file_result.status
ORDER BY COUNT DESC
LIMIT 20;
Broken domains, past 30 days:
SELECT domain, status, COUNT((domain, status))
FROM (
SELECT
ingest_file_result.ingest_type,
ingest_file_result.status,
substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
FROM ingest_file_result
LEFT JOIN ingest_request
ON ingest_file_result.ingest_type = ingest_request.ingest_type
AND ingest_file_result.base_url = ingest_request.base_url
WHERE
-- ingest_request.created >= NOW() - '3 day'::INTERVAL
ingest_file_result.updated >= NOW() - '30 day'::INTERVAL
AND ingest_request.ingest_type = 'pdf'
AND ingest_request.ingest_request_source = 'fatcat-changelog'
) t1
WHERE t1.domain != ''
AND t1.status != 'success'
GROUP BY domain, status
ORDER BY COUNT DESC
LIMIT 25;
Throughput per day, and success, for past 30 days:
SELECT ingest_request.ingest_type,
date(ingest_request.created),
COUNT(*) as total,
COUNT(CASE ingest_file_result.status WHEN 'success' THEN 1 ELSE null END) as success
FROM ingest_file_result
LEFT JOIN ingest_request
ON ingest_file_result.ingest_type = ingest_request.ingest_type
AND ingest_file_result.base_url = ingest_request.base_url
WHERE ingest_request.created >= NOW() - '1 month'::INTERVAL
AND ingest_request.ingest_type = 'pdf'
AND ingest_request.ingest_request_source = 'fatcat-changelog'
GROUP BY ingest_request.ingest_type, ingest_file_result.ingest_type, date(ingest_request.created)
ORDER BY date(ingest_request.created) DESC;
## fatcat-ingest
Broken domains, past 7 days:
SELECT domain, status, COUNT((domain, status))
FROM (
SELECT
ingest_file_result.ingest_type,
ingest_file_result.status,
substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
FROM ingest_file_result
LEFT JOIN ingest_request
ON ingest_file_result.ingest_type = ingest_request.ingest_type
AND ingest_file_result.base_url = ingest_request.base_url
WHERE
-- ingest_request.created >= NOW() - '7 day'::INTERVAL
ingest_file_result.updated >= NOW() - '24 hour'::INTERVAL
AND ingest_request.ingest_type = 'pdf'
AND ingest_request.ingest_request_source = 'fatcat-ingest'
) t1
WHERE t1.domain != ''
AND t1.status != 'success'
GROUP BY domain, status
ORDER BY COUNT DESC
LIMIT 25;
Throughput per day, and success, for past 7 days:
SELECT ingest_request.ingest_type,
date(ingest_file_result.updated),
COUNT(*) as total,
COUNT(CASE ingest_file_result.status WHEN 'success' THEN 1 ELSE null END) as success
FROM ingest_file_result
LEFT JOIN ingest_request
ON ingest_file_result.ingest_type = ingest_request.ingest_type
AND ingest_file_result.base_url = ingest_request.base_url
WHERE
-- ingest_request.created >= NOW() - '7 day'::INTERVAL
ingest_file_result.updated >= NOW() - '24 hour'::INTERVAL
AND ingest_request.ingest_type = 'pdf'
AND ingest_request.ingest_request_source = 'fatcat-ingest'
GROUP BY ingest_request.ingest_type, ingest_file_result.ingest_type, date(ingest_file_result.updated)
ORDER BY date(ingest_file_result.updated) DESC;
Overall status, updated requests past 3 days:
SELECT ingest_request.ingest_type,
ingest_file_result.status,
COUNT(*)
FROM ingest_file_result
LEFT JOIN ingest_request
ON ingest_file_result.ingest_type = ingest_request.ingest_type
AND ingest_file_result.base_url = ingest_request.base_url
WHERE
-- ingest_file_result.updated >= NOW() - '3 day'::INTERVAL
ingest_file_result.updated >= NOW() - '48 hour'::INTERVAL
AND ingest_request.ingest_type = 'pdf'
AND ingest_request.ingest_request_source = 'fatcat-ingest'
GROUP BY ingest_request.ingest_type, ingest_file_result.status
ORDER BY COUNT(*) DESC;
|