1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
|
## Counts / Status
SELECT status_code, COUNT(*) FROM pdftrio GROUP BY status_code;
# NOTE: I earlier deleted a large fraction of non-200 status codes, so
# these aren't representative
status_code | count
-------------+---------
-4 | 16
-2 | 26
200 | 1117501
400 | 2695
(4 rows)
SELECT status, COUNT(*) FROM pdftrio GROUP BY status;
status | count
---------------+---------
error | 2696
error-connect | 26
error-timeout | 16
success | 1118252
(4 rows)
SELECT
COUNT(CASE WHEN ensemble_score IS NOT NULL THEN 1 ELSE NULL END) as ensemble_count,
COUNT(CASE WHEN linear_score IS NOT NULL THEN 1 ELSE NULL END) as linear_count,
COUNT(CASE WHEN bert_score IS NOT NULL THEN 1 ELSE NULL END) as bert_count,
COUNT(CASE WHEN image_score IS NOT NULL THEN 1 ELSE NULL END) as image_count
FROM pdftrio;
ensemble_count | linear_count | bert_count | image_count
----------------+--------------+------------+-------------
1120100 | 976271 | 66209 | 143829
(1 row)
## Histograms
SELECT width_bucket(ensemble_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio
WHERE status = 'success'
AND ensemble_score IS NOT NULL
GROUP BY buckets
ORDER BY buckets;
SELECT width_bucket(bert_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio
WHERE status = 'success'
AND bert_score IS NOT NULL
GROUP BY buckets
ORDER BY buckets;
SELECT width_bucket(linear_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio
WHERE status = 'success'
AND linear_score IS NOT NULL
GROUP BY buckets
ORDER BY buckets;
SELECT width_bucket(image_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio
WHERE status = 'success'
AND image_score IS NOT NULL
GROUP BY buckets
ORDER BY buckets;
|