aboutsummaryrefslogtreecommitdiffstats
path: root/sql/pdftrio_queries.md
blob: 06f718ca294a923bba6166c827ff2382ef9795a2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65

## Counts / Status

    SELECT status_code, COUNT(*) FROM pdftrio GROUP BY status_code;

    # NOTE: I earlier deleted a large fraction of non-200 status codes, so
    # these aren't representative
     status_code |  count  
    -------------+---------
              -4 |      16
              -2 |      26
             200 | 1117501
             400 |    2695
    (4 rows)


    SELECT status, COUNT(*) FROM pdftrio GROUP BY status;

        status     |  count  
    ---------------+---------
     error         |    2696
     error-connect |      26
     error-timeout |      16
     success       | 1118252
    (4 rows)

    SELECT
        COUNT(CASE WHEN ensemble_score IS NOT NULL THEN 1 ELSE NULL END) as ensemble_count,
        COUNT(CASE WHEN linear_score   IS NOT NULL THEN 1 ELSE NULL END) as linear_count,
        COUNT(CASE WHEN bert_score     IS NOT NULL THEN 1 ELSE NULL END) as bert_count,
        COUNT(CASE WHEN image_score    IS NOT NULL THEN 1 ELSE NULL END) as image_count
    FROM pdftrio;


     ensemble_count | linear_count | bert_count | image_count 
    ----------------+--------------+------------+-------------
            1120100 |       976271 |      66209 |      143829
    (1 row)

## Histograms

    SELECT width_bucket(ensemble_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio
    WHERE status = 'success'
        AND ensemble_score IS NOT NULL
    GROUP BY buckets
    ORDER BY buckets;

    SELECT width_bucket(bert_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio
    WHERE status = 'success'
        AND bert_score IS NOT NULL
    GROUP BY buckets
    ORDER BY buckets;

    SELECT width_bucket(linear_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio
    WHERE status = 'success'
        AND linear_score IS NOT NULL
    GROUP BY buckets
    ORDER BY buckets;

    SELECT width_bucket(image_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio
    WHERE status = 'success'
        AND image_score IS NOT NULL
    GROUP BY buckets
    ORDER BY buckets;