From 598475f1e29daa75d46fc15c98ed9aab576b8ca7 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 24 Dec 2019 13:38:56 -0800 Subject: example queries to run on sqlite --- db_queries.md | 52 ++++++++++++++++++++++++++++++++++++++++ db_queries_homepages.md | 64 +++++++++++++++++++++++++++++++++++++++++++++++++ queries.txt | 52 ---------------------------------------- 3 files changed, 116 insertions(+), 52 deletions(-) create mode 100644 db_queries.md create mode 100644 db_queries_homepages.md delete mode 100644 queries.txt diff --git a/db_queries.md b/db_queries.md new file mode 100644 index 0000000..44eae66 --- /dev/null +++ b/db_queries.md @@ -0,0 +1,52 @@ + + SELECT COUNT(DISTINCT issnl) FROM homepage WHERE domain = 'archive.org'; + +Top publishers that have journals in wayback: + + SELECT publisher, COUNT(*) FROM journal LEFT JOIN homepage ON journal.issnl = homepage.issnl WHERE homepage.domain = 'archive.org' GROUP BY journal.publisher ORDER BY COUNT(*) DESC LIMIT 10; + +Top publishers in general: + + SELECT publisher, COUNT(*) from journal GROUP BY publisher ORDER BY COUNT(*) DESC LIMIT 25; + +Homepage URL counts: + + SELECT COUNT(*) FROM homepage; + SELECT COUNT(DISTINCT issnl) FROM homepage; + SELECT issnl, COUNT(*) from homepage GROUP BY issnl ORDER BY COUNT(*) DESC LIMIT 10; + +Top/redundant URLs and SURTs: + + SELECT surt, COUNT(*) FROM homepage GROUP BY surt ORDER BY COUNT(*) DESC LIMIT 10; + + SELECT publisher, name FROM journal LEFT JOIN homepage ON journal.issnl = homepage.issnl WHERE homepage.surt = 'com,benjamins)/'; + +fulltext coverage by publisher type: + + select publisher_type, avg(ia_frac), avg(preserved_frac), count(*) as journal_count, sum(release_count) as paper_count from journal group by publisher_type order by sum(release_count) desc; + + + + select publisher_type, avg(ia_frac), avg(preserved_frac), count(*) as journal_count, sum(release_count) as paper_count from journal group by publisher_type order by sum(release_count) desc; + + + select publisher, count(*) as journal_count, avg(ia_frac) from journal where ia_frac < 0.05 group by publisher order by count(*) desc limit 10; + + + select country, count(*) from journal group by country order by count(*) desc limit 10; + select country, count(*), sum(release_count) from journal group by country order by sum(release_count) desc limit 10; + + select lang, count(*) from journal group by lang order by count(*) desc limit 10; + select lang, count(*), sum(release_count) from journal group by lang order by sum(release_count) desc limit 10; + +Coverage by sherpa color: + + select sherpa_color, sum(ia_count) from journal group by sherpa_color; + +Blocked domains: + + select domain, count(*), sum(blocked) from homepage group by domain order by sum(blocked) desc limit 20; + +Top duplicated domains: + + select url, count(*) from homepage group by url order by count(*) desc limit 20; diff --git a/db_queries_homepages.md b/db_queries_homepages.md new file mode 100644 index 0000000..6f196cd --- /dev/null +++ b/db_queries_homepages.md @@ -0,0 +1,64 @@ + +Journals without homepages (for vicky): + + .mode csv + .output journals_missing_homepages.all.csv + select * from journal where any_homepage=0; + + .mode csv + .output journals_missing_homepages.sample.csv + select * from journal where any_homepage=0 order by random() limit 25; + + select publisher, count(*) from journal where any_homepage=0 group by publisher order by count(*) desc limit 20; + + publisher count(*) + ------------------------------------------------------------ ---------- + ¤ 33033 + Peter Lang International Academic Publishers 1316 + Elsevier 1079 + Informa UK (Taylor & Francis) 709 + Springer-Verlag 481 + OMICS Publishing Group 417 + Georg Thieme Verlag KG 355 + Wiley (John Wiley & Sons) 344 + SAGE Publications 267 + Science Publishing Group 259 + Al Manhal FZ, LLC 258 + Wiley (Blackwell Publishing) 220 + Bentham Science 211 + Egypts Presidential Specialized Council for Education and Sc 201 + Medknow Publications 199 + Inderscience Enterprises Ltd 177 + African Journals Online 167 + Diva Enterprises Private Limited 166 + Scientific Research Publishing, Inc 140 + Hindawi Limited 135 + + Of 1360 Peter Lang journals, only have homepages for 44 of them. + + Of 64373 journals without homepages: + + - 35% have an ISSN-print and not an ISSN-electronic + - 12% have an ISSN-electronic and not an ISSN-print + - 28% have no ISSN-print/electronic breakdown (and for the others may just be missing) + - 41% have some Crossref record (eg, any DOIs registered) (!) + - 27% already have Wikidata entities + - 51% have any work-level metadata in fatcat, and 23% have at least one paper preserved in fatcat + + + select publisher_type, count(*) from journal where any_homepage=0 group by publisher_type order by count(*) desc; + + publisher_type count(*) + ------------------------------------------------------------ ---------- + ¤ 44612 + longtail 5957 + society 3855 + big5 3548 + commercial 3378 + unipress 1556 + oa 793 + other 369 + repository 232 + archive 66 + scielo 7 + diff --git a/queries.txt b/queries.txt deleted file mode 100644 index 44eae66..0000000 --- a/queries.txt +++ /dev/null @@ -1,52 +0,0 @@ - - SELECT COUNT(DISTINCT issnl) FROM homepage WHERE domain = 'archive.org'; - -Top publishers that have journals in wayback: - - SELECT publisher, COUNT(*) FROM journal LEFT JOIN homepage ON journal.issnl = homepage.issnl WHERE homepage.domain = 'archive.org' GROUP BY journal.publisher ORDER BY COUNT(*) DESC LIMIT 10; - -Top publishers in general: - - SELECT publisher, COUNT(*) from journal GROUP BY publisher ORDER BY COUNT(*) DESC LIMIT 25; - -Homepage URL counts: - - SELECT COUNT(*) FROM homepage; - SELECT COUNT(DISTINCT issnl) FROM homepage; - SELECT issnl, COUNT(*) from homepage GROUP BY issnl ORDER BY COUNT(*) DESC LIMIT 10; - -Top/redundant URLs and SURTs: - - SELECT surt, COUNT(*) FROM homepage GROUP BY surt ORDER BY COUNT(*) DESC LIMIT 10; - - SELECT publisher, name FROM journal LEFT JOIN homepage ON journal.issnl = homepage.issnl WHERE homepage.surt = 'com,benjamins)/'; - -fulltext coverage by publisher type: - - select publisher_type, avg(ia_frac), avg(preserved_frac), count(*) as journal_count, sum(release_count) as paper_count from journal group by publisher_type order by sum(release_count) desc; - - - - select publisher_type, avg(ia_frac), avg(preserved_frac), count(*) as journal_count, sum(release_count) as paper_count from journal group by publisher_type order by sum(release_count) desc; - - - select publisher, count(*) as journal_count, avg(ia_frac) from journal where ia_frac < 0.05 group by publisher order by count(*) desc limit 10; - - - select country, count(*) from journal group by country order by count(*) desc limit 10; - select country, count(*), sum(release_count) from journal group by country order by sum(release_count) desc limit 10; - - select lang, count(*) from journal group by lang order by count(*) desc limit 10; - select lang, count(*), sum(release_count) from journal group by lang order by sum(release_count) desc limit 10; - -Coverage by sherpa color: - - select sherpa_color, sum(ia_count) from journal group by sherpa_color; - -Blocked domains: - - select domain, count(*), sum(blocked) from homepage group by domain order by sum(blocked) desc limit 20; - -Top duplicated domains: - - select url, count(*) from homepage group by url order by count(*) desc limit 20; -- cgit v1.2.3