From 1fef036f13dbed7ba875096a3c8cc01ba686c770 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 26 Dec 2019 12:49:16 -0800 Subject: move queries list to sqlite-notebook report format --- db_queries.md | 52 -- db_queries_homepages.md | 64 --- reports/report.2019-12-26.html | 1168 ++++++++++++++++++++++++++++++++++++++++ reports/report_template.md | 207 +++++++ 4 files changed, 1375 insertions(+), 116 deletions(-) delete mode 100644 db_queries.md delete mode 100644 db_queries_homepages.md create mode 100644 reports/report.2019-12-26.html create mode 100644 reports/report_template.md diff --git a/db_queries.md b/db_queries.md deleted file mode 100644 index 44eae66..0000000 --- a/db_queries.md +++ /dev/null @@ -1,52 +0,0 @@ - - SELECT COUNT(DISTINCT issnl) FROM homepage WHERE domain = 'archive.org'; - -Top publishers that have journals in wayback: - - SELECT publisher, COUNT(*) FROM journal LEFT JOIN homepage ON journal.issnl = homepage.issnl WHERE homepage.domain = 'archive.org' GROUP BY journal.publisher ORDER BY COUNT(*) DESC LIMIT 10; - -Top publishers in general: - - SELECT publisher, COUNT(*) from journal GROUP BY publisher ORDER BY COUNT(*) DESC LIMIT 25; - -Homepage URL counts: - - SELECT COUNT(*) FROM homepage; - SELECT COUNT(DISTINCT issnl) FROM homepage; - SELECT issnl, COUNT(*) from homepage GROUP BY issnl ORDER BY COUNT(*) DESC LIMIT 10; - -Top/redundant URLs and SURTs: - - SELECT surt, COUNT(*) FROM homepage GROUP BY surt ORDER BY COUNT(*) DESC LIMIT 10; - - SELECT publisher, name FROM journal LEFT JOIN homepage ON journal.issnl = homepage.issnl WHERE homepage.surt = 'com,benjamins)/'; - -fulltext coverage by publisher type: - - select publisher_type, avg(ia_frac), avg(preserved_frac), count(*) as journal_count, sum(release_count) as paper_count from journal group by publisher_type order by sum(release_count) desc; - - - - select publisher_type, avg(ia_frac), avg(preserved_frac), count(*) as journal_count, sum(release_count) as paper_count from journal group by publisher_type order by sum(release_count) desc; - - - select publisher, count(*) as journal_count, avg(ia_frac) from journal where ia_frac < 0.05 group by publisher order by count(*) desc limit 10; - - - select country, count(*) from journal group by country order by count(*) desc limit 10; - select country, count(*), sum(release_count) from journal group by country order by sum(release_count) desc limit 10; - - select lang, count(*) from journal group by lang order by count(*) desc limit 10; - select lang, count(*), sum(release_count) from journal group by lang order by sum(release_count) desc limit 10; - -Coverage by sherpa color: - - select sherpa_color, sum(ia_count) from journal group by sherpa_color; - -Blocked domains: - - select domain, count(*), sum(blocked) from homepage group by domain order by sum(blocked) desc limit 20; - -Top duplicated domains: - - select url, count(*) from homepage group by url order by count(*) desc limit 20; diff --git a/db_queries_homepages.md b/db_queries_homepages.md deleted file mode 100644 index 6f196cd..0000000 --- a/db_queries_homepages.md +++ /dev/null @@ -1,64 +0,0 @@ - -Journals without homepages (for vicky): - - .mode csv - .output journals_missing_homepages.all.csv - select * from journal where any_homepage=0; - - .mode csv - .output journals_missing_homepages.sample.csv - select * from journal where any_homepage=0 order by random() limit 25; - - select publisher, count(*) from journal where any_homepage=0 group by publisher order by count(*) desc limit 20; - - publisher count(*) - ------------------------------------------------------------ ---------- - ¤ 33033 - Peter Lang International Academic Publishers 1316 - Elsevier 1079 - Informa UK (Taylor & Francis) 709 - Springer-Verlag 481 - OMICS Publishing Group 417 - Georg Thieme Verlag KG 355 - Wiley (John Wiley & Sons) 344 - SAGE Publications 267 - Science Publishing Group 259 - Al Manhal FZ, LLC 258 - Wiley (Blackwell Publishing) 220 - Bentham Science 211 - Egypts Presidential Specialized Council for Education and Sc 201 - Medknow Publications 199 - Inderscience Enterprises Ltd 177 - African Journals Online 167 - Diva Enterprises Private Limited 166 - Scientific Research Publishing, Inc 140 - Hindawi Limited 135 - - Of 1360 Peter Lang journals, only have homepages for 44 of them. - - Of 64373 journals without homepages: - - - 35% have an ISSN-print and not an ISSN-electronic - - 12% have an ISSN-electronic and not an ISSN-print - - 28% have no ISSN-print/electronic breakdown (and for the others may just be missing) - - 41% have some Crossref record (eg, any DOIs registered) (!) - - 27% already have Wikidata entities - - 51% have any work-level metadata in fatcat, and 23% have at least one paper preserved in fatcat - - - select publisher_type, count(*) from journal where any_homepage=0 group by publisher_type order by count(*) desc; - - publisher_type count(*) - ------------------------------------------------------------ ---------- - ¤ 44612 - longtail 5957 - society 3855 - big5 3548 - commercial 3378 - unipress 1556 - oa 793 - other 369 - repository 232 - archive 66 - scielo 7 - diff --git a/reports/report.2019-12-26.html b/reports/report.2019-12-26.html new file mode 100644 index 0000000..4991307 --- /dev/null +++ b/reports/report.2019-12-26.html @@ -0,0 +1,1168 @@ + + +

Chocula Journal Aggregate Stats

+
+ + + + + + +
datetime('now')
2019-12-26 20:48:12
QUERY: SELECT datetime('now');
+
+ + + + + + + + + + +
seqnamefile
0main/home/bnewbold/code/chocula/chocula.2019-12-26.sqlite
QUERY: PRAGMA database_list;
+

Overview

+

Top publishers by journal count:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
publisherCOUNT(*)
43367
Elsevier4060
Informa UK (Taylor & Francis)3363
Springer-Verlag2938
SAGE Publications1437
Peter Lang International Academic Publishers1357
Wiley (Blackwell Publishing)1167
Wiley (John Wiley & Sons)1083
Walter de Gruyter GmbH629
Springer (Biomed Central Ltd.)594
Cambridge University Press580
Georg Thieme Verlag KG534
Hindawi Limited533
OMICS Publishing Group504
JSTOR482
Oxford University Press481
Medknow Publications474
Emerald (MCB UP )470
De Gruyter Open Sp. z o.o.451
Inderscience Enterprises Ltd448
Bentham Science435
CAIRN415
Institute of Electrical and Electronics Engineers395
Brill374
OpenEdition374
QUERY: SELECT publisher, COUNT(*)
+FROM journal
+GROUP BY publisher
+ORDER BY COUNT(*) DESC
+LIMIT 25;
+

Top countries by number of journals:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
countryCOUNT(*)
106970
us7547
gb6411
nl2497
de2026
id1645
br1569
no1079
es1069
pl898
QUERY: SELECT  country,
+COUNT(*)
+FROM journal
+GROUP BY country
+ORDER BY COUNT(*) DESC
+LIMIT 10;
+

.. by number of papers:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
countryCOUNT(*)SUM(release_count)
10697034471001
us754722313191
gb641113028344
nl24978089964
de20264391511
ch6981320180
jp6271311086
fr843905916
br1569683462
ca618526316
QUERY: SELECT  country,
+COUNT(*),
+SUM(release_count)
+FROM journal
+GROUP BY country
+ORDER BY SUM(release_count) DESC
+LIMIT 10;
+

Top languages by number of journals:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
langCOUNT(*)
115227
en25571
es738
id587
pt560
de504
fr420
ja330
ru245
it201
QUERY: SELECT  lang,
+COUNT(*)
+FROM journal
+GROUP BY lang
+ORDER BY COUNT(*) DESC
+LIMIT 10;
+

... by number of papers:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
langCOUNT(*)SUM(release_count)
en2557148034446
11522741589472
de5041061813
ja330567233
fr420327575
ru245166286
es738119675
pt560116237
it20198440
zh6245780
QUERY: SELECT  lang,
+COUNT(*),
+SUM(release_count)
+FROM journal
+GROUP BY lang
+ORDER BY SUM(release_count) DESC
+LIMIT 10;
+

Fatcat Fulltext Coverage

+

Fulltext coverage by publisher type:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
publisher_typeAVG(ia_frac)AVG(preserved_frac)journal_countpaper_count
big50.137903749421841750.7774943548375381564340212427
society0.266298926597911430.44775433539414861058818162939
0.189321658203890080.248429231137684456414614187155
unipress0.332859723569843560.566106884134925975496200069
commercial0.227528492702501030.692507460518657167965971082
longtail0.457118657811704350.496898741198817351973147895
oa0.55569165938681920.733465569142608425811343062
repository0.033186532349008130.1597687993504964751993044
other0.13937887599959730.6215256674027421939862821
archive0.310046170420482940.9824231978239752597765813
scielo0.78764537238749770.8017765242492674405432115
QUERY: SELECT  publisher_type,
+AVG(ia_frac),
+AVG(preserved_frac),
+COUNT(*) AS journal_count,
+SUM(release_count) AS paper_count
+FROM journal
+GROUP BY publisher_type
+ORDER BY SUM(release_count) DESC;
+

Top publishers with very little coverage:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
publisherjournal_countAVG(ia_frac)
124890.0007462422716968573
Informa UK (Taylor & Francis)21590.018535223377689123
Elsevier20870.015386849708409962
Springer-Verlag8540.017150697644092168
SAGE Publications8340.01744322992299731
Wiley (Blackwell Publishing)6500.019927080180958748
Wiley (John Wiley & Sons)6470.01600965663955534
CAIRN3630.008910075730899723
Medknow Publications3130.007785659242349944
JSTOR2850.004924726114898372
QUERY: SELECT  publisher,
+COUNT(*) AS journal_count,
+AVG(ia_frac)
+FROM journal
+WHERE ia_frac < 0.05
+GROUP BY publisher
+ORDER BY journal_count DESC
+LIMIT 10;
+

Amount of fulltext by SHERPA/ROMEO journal color::

+
+ + + + + + + + + + + + + + + + + + + + + + + + +
sherpa_colorSUM(ia_count)
5133222
blue804197
green7940396
white484933
yellow1805028
QUERY: SELECT  sherpa_color,
+SUM(ia_count)
+FROM journal
+GROUP BY sherpa_color;
+

Journal Homepages

+

Homepage URL counts:

+
+ + + + + + + + +
unique_urlsjournals_with_hompages
11581977673
QUERY: SELECT COUNT(DISTINCT surt) as unique_urls, COUNT(DISTINCT issnl) as journals_with_hompages FROM homepage;
+

Journals with the most homepage URLs:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
issnlCOUNT(*)
0717-554X9
0185-25748
0328-12058
0328-34458
0379-86828
0717-59068
1246-74058
1415-65558
1641-876X8
1669-23818
QUERY: SELECT  issnl,
+COUNT(*)
+FROM homepage
+GROUP BY issnl
+ORDER BY COUNT(*) DESC
+LIMIT 10;
+

Top/redundant URLs and SURTs:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
surtCOUNT(*)
org,rsc,pubs)/en/ebooks47
com,benjamins)/29
ro,ubbcluj,studia)/serii/index_en.html22
pl,czest,ajd,bg,kernel)/wydawnictwo.php17
id,ac,unimed,jurnal)/12
pl,edu,uwm,wydawnictwo)/artykul/14/czytelnia.html12
pl,krakow,up,pbc)/dlibra/pubindex?dirids=511
org,ecorfan)/bolivia/research_journals.php10
it,minervamedica)/index2.t9
kr,or,koreascience)/journal/aboutjournal.jsp8
QUERY: SELECT  surt,
+COUNT(*)
+FROM homepage
+GROUP BY surt
+ORDER BY COUNT(*) DESC
+LIMIT 10;
+

What is the deal with all those "benjamins" URLs?

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
publishername
John Benjamins Publishing CompanyNOWELE
Studia Uralo-Altaica
John Benjamins Publishing CompanyLanguage Problems and Language Planning
John Benjamins Publishing CompanyLingvisticæ investigationes
John Benjamins Publishing CompanyLinguistics of the TIbeto-Burman Area
John Benjamins Publishing CompanyPragmatics & Cognition
John Benjamins Publishing CompanyTerminology
John Benjamins Publishing CompanyWritten Language & Literacy
John Benjamins Publishing CompanyFORUM: Revue internationale d?interprétation et de traduction / International Journal of Interpretation and Translation
John Benjamins Publishing CompanyEnglish Text Construction
John Benjamins Publishing CompanyConstructions and Frames
John Benjamins Publishing CompanyPragmatics and Society
John Benjamins Publishing CompanyTranslation and Interpreting Studies
John Benjamins Publishing CompanyLanguage and Dialogue
John Benjamins Publishing CompanyMetaphor in Language, Cognition, and Communication
Hamburg Studies on Linguistic Diversity
John Benjamins Publishing CompanyTranslation Spaces
Studies in Arabic Linguistics
John Benjamins Publishing CompanyJournal of Immersion and Content-Based Language Education (JICB)
Children's Literature, Culture, and Cognition
John Benjamins Publishing CompanyJournal of Language Aggression and Conflict
FILLM Studies in Languages and Literatures
Advances in Historical Sociolinguistics
John Benjamins Publishing CompanyLinguistic Landscape
John Benjamins Publishing CompanyInternational Journal of Learner Corpus Research
John Benjamins Publishing CompanyJournal of Second Language Pronunciation
ITL - International Journal of Applied Linguistics
John Benjamins Publishing CompanyCognitive Individual Differences in Second Language Processing and Acquisition
John Benjamins Publishing CompanyStudies in Germanic Linguistics
QUERY: SELECT  publisher,
+name
+FROM journal
+LEFT JOIN homepage ON journal.issnl = homepage.issnl
+WHERE homepage.surt = 'com,benjamins)/';
+

Domains that block us:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
domainjournal_homepagesSUM(blocked)
jstor.org32413241
wiley.com2503229
brill.nl222164
bentham.org149149
emeraldgrouppublishing.com7676
uem.br3934
emeraldinsight.com40117
rodopi.nl1915
esaunggul.ac.id1111
ingentaconnect.com1209
ucb.br119
univie.ac.at459
erlbaum.com98
iaster.com77
ucla.edu327
uctjournals.com77
elsevier.com28616
inah.gob.mx75
medicaljournals.se55
mohr.de135
QUERY: SELECT  domain,
+COUNT(*) as journal_homepages,
+SUM(blocked)
+FROM homepage
+GROUP BY domain
+ORDER BY SUM(blocked) DESC
+LIMIT 20;
+

Top duplicated domains:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
urlCOUNT(*)
http://www.studia.ubbcluj.ro/serii/index_en.html22
http://jurnal.unimed.ac.id/12
http://wydawnictwo.uwm.edu.pl/artykul/14/czytelnia.html12
https://benjamins.com/12
http://pbc.up.krakow.pl/dlibra/pubindex?dirids=511
http://www.ecorfan.org/bolivia/research_journals.php9
http://www.minervamedica.it/index2.t9
http://www.koreascience.or.kr/journal/AboutJournal.jsp8
https://www.benjamins.com/8
http://dlibra.up.krakow.pl:8080/dlibra/dlibra/collectiondescription?dirids=57
http://gesundheitsfoerderung.ch/ueber-uns/downloads.html6
http://www.ijmra.us/6
http://www.inderscience.com/browse/index.php6
https://www.iospress.nl/6
http://kernel.bg.ajd.czest.pl/wydawnictwo.php#FP5
http://nsd.no/5
http://www.duei.de/show.php/de/content/publikationen/giga-focus/giga-focus.html5
http://www.hottopos.com/revistas.htm5
http://www.inderscience.com/5
http://www.publicaciones.fahce.unlp.edu.ar/5
QUERY: SELECT  url,
+COUNT(*)
+FROM homepage
+GROUP BY url
+ORDER BY COUNT(*) DESC
+LIMIT 20;
+

Number of journals with a homepage that points to web.archive.org or archive.org:

+
+ + + + + + +
COUNT(DISTINCT issnl)
164
QUERY: SELECT COUNT(DISTINCT issnl)
+FROM homepage
+WHERE domain = 'archive.org';
+

Top publishers that have journals in wayback:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
publisherCOUNT(*)
39
EDP Sciences11
PERSEE Program3
CAIRN2
Fabula2
Institut du monde et du développement pour la bonne gouvernance publique2
ANPAD1
ANR le Saint-Simonisme 18-211
Ad hoc (Rennes)1
Al Manhal FZ, LLC1
QUERY: SELECT  publisher,
+COUNT(*)
+FROM journal
+LEFT JOIN homepage ON journal.issnl = homepage.issnl
+WHERE homepage.domain = 'archive.org'
+GROUP BY journal.publisher
+ORDER BY COUNT(*) DESC
+LIMIT 10;
+

Top publishers by number of journals missing a homepage:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
publisherCOUNT(*)
34205
Peter Lang International Academic Publishers1309
Elsevier1036
Informa UK (Taylor & Francis)650
Springer-Verlag465
OMICS Publishing Group413
Georg Thieme Verlag KG357
Wiley (John Wiley & Sons)330
Egypts Presidential Specialized Council for Education and Scientific Research266
Science Publishing Group260
SAGE Publications256
Al Manhal FZ, LLC250
Bentham Science214
Wiley (Blackwell Publishing)212
Medknow Publications203
Inderscience Enterprises Ltd170
African Journals Online166
Diva Enterprises Private Limited166
PERSEE Program142
Scientific Research Publishing, Inc139
QUERY: SELECT  publisher,
+COUNT(*)
+FROM journal
+WHERE any_homepage=0
+GROUP BY publisher
+ORDER BY COUNT(*) DESC
+LIMIT 20;
+
diff --git a/reports/report_template.md b/reports/report_template.md new file mode 100644 index 0000000..ac98649 --- /dev/null +++ b/reports/report_template.md @@ -0,0 +1,207 @@ + + + +# Chocula Journal Aggregate Stats + + +```sql +SELECT datetime('now'); +``` + +```sql +PRAGMA database_list; +``` + +## Overview + +Top publishers by journal count: + +```sql +SELECT publisher, COUNT(*) +FROM journal +GROUP BY publisher +ORDER BY COUNT(*) DESC +LIMIT 25; +``` + +Top countries by number of journals: + +```sql +SELECT country, + COUNT(*) +FROM journal +GROUP BY country +ORDER BY COUNT(*) DESC +LIMIT 10; +``` + +.. by number of papers: + +```sql +SELECT country, + COUNT(*), + SUM(release_count) +FROM journal +GROUP BY country +ORDER BY SUM(release_count) DESC +LIMIT 10; +``` + +Top languages by number of journals: + +```sql +SELECT lang, + COUNT(*) +FROM journal +GROUP BY lang +ORDER BY COUNT(*) DESC +LIMIT 10; +``` + +... by number of papers: + +```sql +SELECT lang, + COUNT(*), + SUM(release_count) +FROM journal +GROUP BY lang +ORDER BY SUM(release_count) DESC +LIMIT 10; +``` + +## Fatcat Fulltext Coverage + +Fulltext coverage by publisher type: + +```sql +SELECT publisher_type, + AVG(ia_frac), + AVG(preserved_frac), + COUNT(*) AS journal_count, + SUM(release_count) AS paper_count +FROM journal +GROUP BY publisher_type +ORDER BY SUM(release_count) DESC; +``` + +Top publishers with very little coverage: + +```sql +SELECT publisher, + COUNT(*) AS journal_count, + AVG(ia_frac) +FROM journal +WHERE ia_frac < 0.05 +GROUP BY publisher +ORDER BY journal_count DESC +LIMIT 10; +``` + +Amount of fulltext by SHERPA/ROMEO journal color:: + +```sql +SELECT sherpa_color, + SUM(ia_count) +FROM journal +GROUP BY sherpa_color; +``` + +## Journal Homepages + +Homepage URL counts: + +```sql +SELECT COUNT(DISTINCT surt) as unique_urls, COUNT(DISTINCT issnl) as journals_with_hompages FROM homepage; +``` + +Journals with the most homepage URLs: + +```sql +SELECT issnl, + COUNT(*) +FROM homepage +GROUP BY issnl +ORDER BY COUNT(*) DESC +LIMIT 10; +``` + +Top/redundant URLs and SURTs: + +```sql +SELECT surt, + COUNT(*) +FROM homepage +GROUP BY surt +ORDER BY COUNT(*) DESC +LIMIT 10; +``` + +What is the deal with all those "benjamins" URLs? + +```sql +SELECT publisher, + name +FROM journal +LEFT JOIN homepage ON journal.issnl = homepage.issnl +WHERE homepage.surt = 'com,benjamins)/'; +``` + +Domains that block us: + +```sql +SELECT domain, + COUNT(*) as journal_homepages, + SUM(blocked) +FROM homepage +GROUP BY domain +ORDER BY SUM(blocked) DESC +LIMIT 20; +``` + +Top duplicated domains: + +```sql +SELECT url, + COUNT(*) +FROM homepage +GROUP BY url +ORDER BY COUNT(*) DESC +LIMIT 20; +``` + +Number of journals with a homepage that points to web.archive.org or archive.org: + +```sql +SELECT COUNT(DISTINCT issnl) +FROM homepage +WHERE domain = 'archive.org'; +``` + +Top publishers that have journals in wayback: + +```sql +SELECT publisher, + COUNT(*) +FROM journal +LEFT JOIN homepage ON journal.issnl = homepage.issnl +WHERE homepage.domain = 'archive.org' +GROUP BY journal.publisher +ORDER BY COUNT(*) DESC +LIMIT 10; +``` +Top publishers by number of journals missing a homepage: + +```sql +SELECT publisher, + COUNT(*) +FROM journal +WHERE any_homepage=0 +GROUP BY publisher +ORDER BY COUNT(*) DESC +LIMIT 20; +``` + -- cgit v1.2.3