aboutsummaryrefslogtreecommitdiffstats
path: root/queries.txt
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2019-12-24 13:38:56 -0800
committerBryan Newbold <bnewbold@archive.org>2019-12-24 13:38:56 -0800
commit598475f1e29daa75d46fc15c98ed9aab576b8ca7 (patch)
tree54a6fb9cbf661eb43310c410ed8c4671820827c5 /queries.txt
parent46e4b69c28f6132e3ae08a2e6e5bbb065458de28 (diff)
downloadchocula-598475f1e29daa75d46fc15c98ed9aab576b8ca7.tar.gz
chocula-598475f1e29daa75d46fc15c98ed9aab576b8ca7.zip
example queries to run on sqlite
Diffstat (limited to 'queries.txt')
-rw-r--r--queries.txt52
1 files changed, 0 insertions, 52 deletions
diff --git a/queries.txt b/queries.txt
deleted file mode 100644
index 44eae66..0000000
--- a/queries.txt
+++ /dev/null
@@ -1,52 +0,0 @@
-
- SELECT COUNT(DISTINCT issnl) FROM homepage WHERE domain = 'archive.org';
-
-Top publishers that have journals in wayback:
-
- SELECT publisher, COUNT(*) FROM journal LEFT JOIN homepage ON journal.issnl = homepage.issnl WHERE homepage.domain = 'archive.org' GROUP BY journal.publisher ORDER BY COUNT(*) DESC LIMIT 10;
-
-Top publishers in general:
-
- SELECT publisher, COUNT(*) from journal GROUP BY publisher ORDER BY COUNT(*) DESC LIMIT 25;
-
-Homepage URL counts:
-
- SELECT COUNT(*) FROM homepage;
- SELECT COUNT(DISTINCT issnl) FROM homepage;
- SELECT issnl, COUNT(*) from homepage GROUP BY issnl ORDER BY COUNT(*) DESC LIMIT 10;
-
-Top/redundant URLs and SURTs:
-
- SELECT surt, COUNT(*) FROM homepage GROUP BY surt ORDER BY COUNT(*) DESC LIMIT 10;
-
- SELECT publisher, name FROM journal LEFT JOIN homepage ON journal.issnl = homepage.issnl WHERE homepage.surt = 'com,benjamins)/';
-
-fulltext coverage by publisher type:
-
- select publisher_type, avg(ia_frac), avg(preserved_frac), count(*) as journal_count, sum(release_count) as paper_count from journal group by publisher_type order by sum(release_count) desc;
-
-
-
- select publisher_type, avg(ia_frac), avg(preserved_frac), count(*) as journal_count, sum(release_count) as paper_count from journal group by publisher_type order by sum(release_count) desc;
-
-
- select publisher, count(*) as journal_count, avg(ia_frac) from journal where ia_frac < 0.05 group by publisher order by count(*) desc limit 10;
-
-
- select country, count(*) from journal group by country order by count(*) desc limit 10;
- select country, count(*), sum(release_count) from journal group by country order by sum(release_count) desc limit 10;
-
- select lang, count(*) from journal group by lang order by count(*) desc limit 10;
- select lang, count(*), sum(release_count) from journal group by lang order by sum(release_count) desc limit 10;
-
-Coverage by sherpa color:
-
- select sherpa_color, sum(ia_count) from journal group by sherpa_color;
-
-Blocked domains:
-
- select domain, count(*), sum(blocked) from homepage group by domain order by sum(blocked) desc limit 20;
-
-Top duplicated domains:
-
- select url, count(*) from homepage group by url order by count(*) desc limit 20;