aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-06-25 16:32:11 -0700
committerBryan Newbold <bnewbold@archive.org>2020-06-25 16:32:11 -0700
commit561a1afe1c5c55a21f9b1db52344e58b2635a3a6 (patch)
treedb39a9b7a92a45895bf8659142e7caef5083966f
parent93b8d7364359ba7f1a4adbc022286abcdf310834 (diff)
downloadsandcrawler-561a1afe1c5c55a21f9b1db52344e58b2635a3a6.tar.gz
sandcrawler-561a1afe1c5c55a21f9b1db52344e58b2635a3a6.zip
pdftrio SQL queries
-rw-r--r--sql/pdftrio_queries.md65
1 files changed, 65 insertions, 0 deletions
diff --git a/sql/pdftrio_queries.md b/sql/pdftrio_queries.md
new file mode 100644
index 0000000..06f718c
--- /dev/null
+++ b/sql/pdftrio_queries.md
@@ -0,0 +1,65 @@
+
+## Counts / Status
+
+ SELECT status_code, COUNT(*) FROM pdftrio GROUP BY status_code;
+
+ # NOTE: I earlier deleted a large fraction of non-200 status codes, so
+ # these aren't representative
+ status_code | count
+ -------------+---------
+ -4 | 16
+ -2 | 26
+ 200 | 1117501
+ 400 | 2695
+ (4 rows)
+
+
+ SELECT status, COUNT(*) FROM pdftrio GROUP BY status;
+
+ status | count
+ ---------------+---------
+ error | 2696
+ error-connect | 26
+ error-timeout | 16
+ success | 1118252
+ (4 rows)
+
+ SELECT
+ COUNT(CASE WHEN ensemble_score IS NOT NULL THEN 1 ELSE NULL END) as ensemble_count,
+ COUNT(CASE WHEN linear_score IS NOT NULL THEN 1 ELSE NULL END) as linear_count,
+ COUNT(CASE WHEN bert_score IS NOT NULL THEN 1 ELSE NULL END) as bert_count,
+ COUNT(CASE WHEN image_score IS NOT NULL THEN 1 ELSE NULL END) as image_count
+ FROM pdftrio;
+
+
+ ensemble_count | linear_count | bert_count | image_count
+ ----------------+--------------+------------+-------------
+ 1120100 | 976271 | 66209 | 143829
+ (1 row)
+
+## Histograms
+
+ SELECT width_bucket(ensemble_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio
+ WHERE status = 'success'
+ AND ensemble_score IS NOT NULL
+ GROUP BY buckets
+ ORDER BY buckets;
+
+ SELECT width_bucket(bert_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio
+ WHERE status = 'success'
+ AND bert_score IS NOT NULL
+ GROUP BY buckets
+ ORDER BY buckets;
+
+ SELECT width_bucket(linear_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio
+ WHERE status = 'success'
+ AND linear_score IS NOT NULL
+ GROUP BY buckets
+ ORDER BY buckets;
+
+ SELECT width_bucket(image_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio
+ WHERE status = 'success'
+ AND image_score IS NOT NULL
+ GROUP BY buckets
+ ORDER BY buckets;
+