diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-06-25 16:32:11 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-06-25 16:32:11 -0700 |
commit | 561a1afe1c5c55a21f9b1db52344e58b2635a3a6 (patch) | |
tree | db39a9b7a92a45895bf8659142e7caef5083966f /sql | |
parent | 93b8d7364359ba7f1a4adbc022286abcdf310834 (diff) | |
download | sandcrawler-561a1afe1c5c55a21f9b1db52344e58b2635a3a6.tar.gz sandcrawler-561a1afe1c5c55a21f9b1db52344e58b2635a3a6.zip |
pdftrio SQL queries
Diffstat (limited to 'sql')
-rw-r--r-- | sql/pdftrio_queries.md | 65 |
1 files changed, 65 insertions, 0 deletions
diff --git a/sql/pdftrio_queries.md b/sql/pdftrio_queries.md new file mode 100644 index 0000000..06f718c --- /dev/null +++ b/sql/pdftrio_queries.md @@ -0,0 +1,65 @@ + +## Counts / Status + + SELECT status_code, COUNT(*) FROM pdftrio GROUP BY status_code; + + # NOTE: I earlier deleted a large fraction of non-200 status codes, so + # these aren't representative + status_code | count + -------------+--------- + -4 | 16 + -2 | 26 + 200 | 1117501 + 400 | 2695 + (4 rows) + + + SELECT status, COUNT(*) FROM pdftrio GROUP BY status; + + status | count + ---------------+--------- + error | 2696 + error-connect | 26 + error-timeout | 16 + success | 1118252 + (4 rows) + + SELECT + COUNT(CASE WHEN ensemble_score IS NOT NULL THEN 1 ELSE NULL END) as ensemble_count, + COUNT(CASE WHEN linear_score IS NOT NULL THEN 1 ELSE NULL END) as linear_count, + COUNT(CASE WHEN bert_score IS NOT NULL THEN 1 ELSE NULL END) as bert_count, + COUNT(CASE WHEN image_score IS NOT NULL THEN 1 ELSE NULL END) as image_count + FROM pdftrio; + + + ensemble_count | linear_count | bert_count | image_count + ----------------+--------------+------------+------------- + 1120100 | 976271 | 66209 | 143829 + (1 row) + +## Histograms + + SELECT width_bucket(ensemble_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio + WHERE status = 'success' + AND ensemble_score IS NOT NULL + GROUP BY buckets + ORDER BY buckets; + + SELECT width_bucket(bert_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio + WHERE status = 'success' + AND bert_score IS NOT NULL + GROUP BY buckets + ORDER BY buckets; + + SELECT width_bucket(linear_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio + WHERE status = 'success' + AND linear_score IS NOT NULL + GROUP BY buckets + ORDER BY buckets; + + SELECT width_bucket(image_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio + WHERE status = 'success' + AND image_score IS NOT NULL + GROUP BY buckets + ORDER BY buckets; + |