diff options
-rw-r--r-- | sql/pdftrio_queries.md | 65 |
1 files changed, 65 insertions, 0 deletions
diff --git a/sql/pdftrio_queries.md b/sql/pdftrio_queries.md new file mode 100644 index 0000000..06f718c --- /dev/null +++ b/sql/pdftrio_queries.md @@ -0,0 +1,65 @@ + +## Counts / Status + + SELECT status_code, COUNT(*) FROM pdftrio GROUP BY status_code; + + # NOTE: I earlier deleted a large fraction of non-200 status codes, so + # these aren't representative + status_code | count + -------------+--------- + -4 | 16 + -2 | 26 + 200 | 1117501 + 400 | 2695 + (4 rows) + + + SELECT status, COUNT(*) FROM pdftrio GROUP BY status; + + status | count + ---------------+--------- + error | 2696 + error-connect | 26 + error-timeout | 16 + success | 1118252 + (4 rows) + + SELECT + COUNT(CASE WHEN ensemble_score IS NOT NULL THEN 1 ELSE NULL END) as ensemble_count, + COUNT(CASE WHEN linear_score IS NOT NULL THEN 1 ELSE NULL END) as linear_count, + COUNT(CASE WHEN bert_score IS NOT NULL THEN 1 ELSE NULL END) as bert_count, + COUNT(CASE WHEN image_score IS NOT NULL THEN 1 ELSE NULL END) as image_count + FROM pdftrio; + + + ensemble_count | linear_count | bert_count | image_count + ----------------+--------------+------------+------------- + 1120100 | 976271 | 66209 | 143829 + (1 row) + +## Histograms + + SELECT width_bucket(ensemble_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio + WHERE status = 'success' + AND ensemble_score IS NOT NULL + GROUP BY buckets + ORDER BY buckets; + + SELECT width_bucket(bert_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio + WHERE status = 'success' + AND bert_score IS NOT NULL + GROUP BY buckets + ORDER BY buckets; + + SELECT width_bucket(linear_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio + WHERE status = 'success' + AND linear_score IS NOT NULL + GROUP BY buckets + ORDER BY buckets; + + SELECT width_bucket(image_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio + WHERE status = 'success' + AND image_score IS NOT NULL + GROUP BY buckets + ORDER BY buckets; + |