From b924125091eee0a537ae9018bc331eb74a738daf Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 7 May 2020 16:01:57 -0700 Subject: move journal summary files to reports folder --- reports/journal_summary.2019-07-31.html | 1018 +++++++++++++++++++++++++++++++ reports/journal_summary.md | 134 ++++ 2 files changed, 1152 insertions(+) create mode 100644 reports/journal_summary.2019-07-31.html create mode 100644 reports/journal_summary.md (limited to 'reports') diff --git a/reports/journal_summary.2019-07-31.html b/reports/journal_summary.2019-07-31.html new file mode 100644 index 0000000..f25a215 --- /dev/null +++ b/reports/journal_summary.2019-07-31.html @@ -0,0 +1,1018 @@ +

Fatcat "Chocula" Journal Metadata Summary

+

This report is auto-generated from a sqlite database file, which should be available/included.

+
+ + + + + + +
datetime('now')
2019-08-01 03:55:43
QUERY: SELECT datetime('now');
+

Note that pretty much all of the fatcat release stats are on a release, not +work basis, so there may be over-counting. Also, as of July 2019 there were +over 1.5 million OA longtail releases which are not linked to a container +(journal).

+

Basics

+

Top countries by journal count (and fatcat release counts):

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
countryjournal_countsum(release_count)
9193134853365
us683820812424
gb596712238711
nl23437763639
de18414176386
id1562112525
br1501614272
es1012275328
pl807256632
it803304793
QUERY: SELECT country, COUNT(*) AS journal_count, sum(release_count) from journal group by country order by count(*) desc limit 10;
+

Top languages by journal count (and fatcat release counts):

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
langjournal_countrelease_count
9685639729766
en2558446389136
es738105717
id58735909
pt56099100
de5041050664
fr420314582
ja330589020
ru245150367
it20297561
QUERY: SELECT lang, COUNT(*) as journal_count, sum(release_count) as release_count FROM journal GROUP BY lang ORDER BY COUNT(*) DESC LIMIT 10;
+

Aggregate fatcat fulltext release coverage by OA status:

+
+ + + + + + + + + + + + + + + + + + + + + +
is_oajournal_countSUM(release_count)SUM(ia_count)total_ia_frac
0749854253388656780630.13
15185046171246103577050.22
QUERY: SELECT is_oa, COUNT(*) AS journal_count, SUM(release_count), SUM(ia_count), ROUND(1. * SUM(ia_count) / SUM(release_count), 2) as total_ia_frac FROM journal GROUP BY is_oa;
+

Publisher Segmentation

+

Big publishers by journal count:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
publisherjournal_countSUM(release_count)
472036504661
Elsevier400916206074
Informa UK (Taylor & Francis)33323921600
Springer-Verlag28755638303
SAGE Publications13722344281
Peter Lang International Academic Publishers1360252
Wiley (Blackwell Publishing)11733640989
Wiley (John Wiley & Sons)10394456867
Walter de Gruyter GmbH624435616
Springer (Biomed Central Ltd.)558450187
Cambridge University Press5531519555
Hindawi Limited521194707
Georg Thieme Verlag KG512688731
OMICS Publishing Group50293785
JSTOR495738890
QUERY: SELECT publisher, COUNT(*) AS journal_count, SUM(release_count) from journal GROUP BY publisher ORDER BY COUNT(*) DESC LIMIT 15;
+

Number of publishers with 3 or fewer journals:

+
+ + + + + + +
COUNT(*)
18307
QUERY: SELECT COUNT(*) FROM (SELECT publisher, COUNT(*) as journal_count FROM journal GROUP BY publisher) WHERE journal_count <= 3;
+

Fulltext coverage by publisher type:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
publisher_typeia_total_fracpreserved_total_fracjournal_countpaper_count
big50.120.891536239334593
society0.250.71854517499721
0.150.285971613550129
commercial0.120.8466086041845
unipress0.240.8460175876070
longtail0.480.54255232216976
oa0.760.8424761180835
repository0.130.3646925092
other0.080.88927861701
archive0.260.98604792273
scielo0.80.81411425897
QUERY: SELECT publisher_type, ROUND(1.0 * SUM(ia_count) / SUM(release_count), 2) as ia_total_frac, ROUND(1.0 * SUM(preserved_count) / SUM(release_count), 2) as preserved_total_frac, count(*) as journal_count, sum(release_count) as paper_count from journal group by publisher_type order by sum(release_count) desc;
+

Fulltext coverage by publisher type (NOTE: averaging fractions without weighing by release count, intentionally):

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
publisher_typeavg_ia_fracavg_preserved_fracjournal_countpaper_count
big50.150.811536239334593
society0.320.53854517499721
0.240.315971613550129
commercial0.260.7666086041845
unipress0.420.6960175876070
longtail0.550.58255232216976
oa0.630.824761180835
repository0.040.19646925092
other0.150.65927861701
archive0.310.98604792273
scielo0.830.85411425897
QUERY: SELECT publisher_type, ROUND(1.0 * AVG(ia_frac), 2) as avg_ia_frac, ROUND(1.0 * AVG(preserved_frac), 2) as avg_preserved_frac, count(*) as journal_count, sum(release_count) as paper_count from journal group by publisher_type order by sum(release_count) DESC;
+

Number of journals with no releases (metadata or fulltext) in fatcat:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
publisher_typejournals_with_no_releases
21195
longtail13194
society1770
commercial1626
unipress1521
big5363
oa85
archive45
other17
repository10
QUERY: SELECT publisher_type, COUNT(*) AS journals_with_no_releases FROM journal WHERE release_count = 0 GROUP BY publisher_type ORDER BY COUNT(*) DESC;
+

IA Fulltext Coverage

+

Coverage by sherpa color:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
sherpa_coloria_fulltext_countrelease_counttotal_ia_frac
5076210263421170.19
blue79938128768910.28
green7873174415163200.19
white48343446327150.1
yellow1803569133370890.14
QUERY: SELECT sherpa_color, SUM(ia_count) as ia_fulltext_count, SUM(release_count) as release_count, ROUND(1.0 * SUM(ia_count) / SUM(release_count), 2) as total_ia_frac FROM journal GROUP BY sherpa_color;
+

Top publishers with very little IA coverage (NOTE: averaging fractions without weight by journal size):

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
publisherjournal_countROUND(avg(ia_frac),3)
132260.001
Informa UK (Taylor & Francis)20810.019
Elsevier20530.015
SAGE Publications7640.018
Springer-Verlag7610.018
Wiley (Blackwell Publishing)6410.02
Wiley (John Wiley & Sons)5930.017
JSTOR2950.005
CAIRN2800.012
Medknow Publications2800.008
QUERY: SELECT publisher, count(*) as journal_count, ROUND(avg(ia_frac),3) from journal where ia_frac < 0.05 group by publisher order by count(*) desc limit 10;
+

Homepages

+

Journal counts by homepage status:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
any_homepageany_live_homepageany_gwb_homepageCOUNT(*)frac
000656140.52
10054340.04
10148430.04
11036240.03
111473200.37
QUERY: SELECT any_homepage, any_live_homepage, any_gwb_homepage, COUNT(*), ROUND(1.0 * COUNT(*) / (SELECT COUNT(*) FROM journal), 2) AS frac FROM journal GROUP BY any_homepage, any_live_homepage, any_gwb_homepage;
+

Number of unique journals that have a homepage pointing to wayback or archive.org:

+
+ + + + + + +
COUNT(DISTINCT issnl)
154
QUERY: SELECT COUNT(DISTINCT issnl) FROM homepage WHERE domain = 'archive.org';
+

Top publishers that have journals in wayback:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
publisherCOUNT(*)
63
EDP Sciences11
PERSEE Program3
CAIRN2
Fabula2
Institut du monde et du développement pour la bonne gouvernance publique2
ANPAD1
Ad hoc (Rennes)1
Asociación Revista Venezolana de Ciencia y Tecnología de Alimentos1
Association Epiga1
QUERY: SELECT publisher, COUNT(*) FROM journal LEFT JOIN homepage ON journal.issnl = homepage.issnl WHERE homepage.domain = 'archive.org' GROUP BY journal.publisher ORDER BY COUNT(*) DESC LIMIT 10;
+

Homepage URL counts:

+
+ + + + + + + + + + +
rowsissnlssurts
839096122182678
QUERY: SELECT COUNT(*) as rows, COUNT(DISTINCT issnl) as issnls, COUNT(DISTINCT surt) as surts FROM homepage;
+

Journals with most unique SURTs:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
issnlCOUNT(*)
0717-34586
1406-42436
2190-59916
0011-67935
0022-98305
0091-67655
0102-76385
0144-84635
0212-65675
0350-154X5
QUERY: SELECT issnl, COUNT(*) from homepage GROUP BY issnl ORDER BY COUNT(*) DESC LIMIT 10;
+

Blocked domains:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
domaincount(*)sum(blocked)
jstor.org32353234
brill.nl216161
wiley.com2372152
bentham.org146146
tandfonline.com291984
cairn.info5249
emeraldgrouppublishing.com4949
emeraldinsight.com39016
rodopi.nl1915
sagepub.com18639
vsppub.com99
iaster.com77
mohr.de157
scienceq.org77
uctjournals.com77
elsevier.com27466
esaunggul.ac.id66
bloomsbury.com44
gov.hu84
inap.es44
QUERY: SELECT domain, count(*), sum(blocked) from homepage group by domain order by sum(blocked) desc limit 20;
+

Top duplicated URLs and SURTs:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
urlCOUNT(*)
http://www.studia.ubbcluj.ro/serii/index_en.html22
http://jurnal.unimed.ac.id/12
https://benjamins.com/10
http://www.ecorfan.org/bolivia/research_journals.php9
http://www.minervamedica.it/index2.t8
https://www.benjamins.com/8
http://gesundheitsfoerderung.ch/ueber-uns/downloads.html6
http://www.inderscience.com/browse/index.php6
http://www.iospress.nl/6
http://edizionicafoscari.unive.it/it/edizioni/collane/antichistica/5
QUERY: SELECT url, COUNT(*) FROM homepage GROUP BY url ORDER BY COUNT(*) DESC LIMIT 10;
+

Top terminal URLs catch cases where many URLs redirect to a single page:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
terminal_urlCOUNT(DISTINCT issnl)
https://portal.eiu.com/Login.aspx?c=1180
https://taylorandfrancis.com151
https://onlinelibrary.wiley.com/131
http://eia.libis.lt/aboutProject.php70
https://www.hindawi.com/journals/isrn/67
https://us.sagepub.com/en-us/nam/insights-journals61
https://metapress.com44
http://blackwell-science.com/41
https://staatsbibliothek-berlin.de/Emedien-Meldungen/Login-Hinweis/40
https://taylorandfrancis.com/37
https://pubs.rsc.org/en/ebooks36
http://explore.tandfonline.com/page/ah/maney-publishing-journals34
http://www.bentham.org/403.shtml27
https://metapress.com/27
http://www.inderscience.com/browse/index.php23
http://www.studia.ubbcluj.ro/serii/index_en.html22
https://www.impresaitalia.info20
https://www.tandfonline.com/20
https://benjamins.com/content/home17
https://content-select.com/login15
QUERY: SELECT terminal_url, COUNT(DISTINCT issnl) FROM homepage WHERE terminal_url IS NOT NULL GROUP BY terminal_url ORDER BY COUNT(DISTINCT issnl) DESC LIMIT 20;
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
surtCOUNT(*)
org,rsc,pubs)/en/ebooks47
com,benjamins)/27
ro,ubbcluj,studia)/serii/index_en.html22
id,ac,unimed,jurnal)/12
org,ecorfan)/bolivia/research_journals.php9
it,minervamedica)/index2.t8
ch,gesundheitsfoerderung)/ueber-uns/downloads.html6
com,inderscience)/browse/index.php6
nl,iospress)/6
com,inderscience)/5
QUERY: SELECT surt, COUNT(*) FROM homepage GROUP BY surt ORDER BY COUNT(*) DESC LIMIT 10;
+
diff --git a/reports/journal_summary.md b/reports/journal_summary.md new file mode 100644 index 0000000..71ac9f5 --- /dev/null +++ b/reports/journal_summary.md @@ -0,0 +1,134 @@ + +# Fatcat "Chocula" Journal Metadata Summary + +This report is auto-generated from a sqlite database file, which should be available/included. + +```sql +SELECT datetime('now'); +``` + +Note that pretty much all of the fatcat release stats are on a *release*, not +*work* basis, so there may be over-counting. Also, as of July 2019 there were +over 1.5 million OA longtail releases which are *not* linked to a container +(journal). + +### Basics + +Top countries by journal count (and fatcat release counts): + +```sql +SELECT country, COUNT(*) AS journal_count, sum(release_count) from journal group by country order by count(*) desc limit 10; +``` + +Top languages by journal count (and fatcat release counts): + +```sql +SELECT lang, COUNT(*) as journal_count, sum(release_count) as release_count FROM journal GROUP BY lang ORDER BY COUNT(*) DESC LIMIT 10; +``` + +Aggregate fatcat fulltext release coverage by OA status: + +```sql +SELECT is_oa, COUNT(*) AS journal_count, SUM(release_count), SUM(ia_count), ROUND(1. * SUM(ia_count) / SUM(release_count), 2) as total_ia_frac FROM journal GROUP BY is_oa; +``` + +### Publisher Segmentation + +Big publishers by journal count: + +```sql +SELECT publisher, COUNT(*) AS journal_count, SUM(release_count) from journal GROUP BY publisher ORDER BY COUNT(*) DESC LIMIT 15; +``` + +Number of publishers with 3 or fewer journals: + +```sql +SELECT COUNT(*) FROM (SELECT publisher, COUNT(*) as journal_count FROM journal GROUP BY publisher) WHERE journal_count <= 3; +``` + +Fulltext coverage by publisher type: + +```sql +SELECT publisher_type, ROUND(1.0 * SUM(ia_count) / SUM(release_count), 2) as ia_total_frac, ROUND(1.0 * SUM(preserved_count) / SUM(release_count), 2) as preserved_total_frac, count(*) as journal_count, sum(release_count) as paper_count from journal group by publisher_type order by sum(release_count) desc; +``` + +Fulltext coverage by publisher type (NOTE: averaging fractions without weighing by release count, intentionally): + +```sql +SELECT publisher_type, ROUND(1.0 * AVG(ia_frac), 2) as avg_ia_frac, ROUND(1.0 * AVG(preserved_frac), 2) as avg_preserved_frac, count(*) as journal_count, sum(release_count) as paper_count from journal group by publisher_type order by sum(release_count) DESC; +``` + +Number of journals with no releases (metadata or fulltext) in fatcat: + +```sql +SELECT publisher_type, COUNT(*) AS journals_with_no_releases FROM journal WHERE release_count = 0 GROUP BY publisher_type ORDER BY COUNT(*) DESC; +``` + +### IA Fulltext Coverage + +Coverage by sherpa color: + +```sql +SELECT sherpa_color, SUM(ia_count) as ia_fulltext_count, SUM(release_count) as release_count, ROUND(1.0 * SUM(ia_count) / SUM(release_count), 2) as total_ia_frac FROM journal GROUP BY sherpa_color; +``` + +Top publishers with very little IA coverage (NOTE: averaging fractions without weight by journal size): + +```sql +SELECT publisher, count(*) as journal_count, ROUND(avg(ia_frac),3) from journal where ia_frac < 0.05 group by publisher order by count(*) desc limit 10; +``` + +### Homepages + +Journal counts by homepage status: + +```sql +SELECT any_homepage, any_live_homepage, any_gwb_homepage, COUNT(*), ROUND(1.0 * COUNT(*) / (SELECT COUNT(*) FROM journal), 2) AS frac FROM journal GROUP BY any_homepage, any_live_homepage, any_gwb_homepage; +``` + +Number of unique journals that have a homepage pointing to wayback or archive.org: + +```sql +SELECT COUNT(DISTINCT issnl) FROM homepage WHERE domain = 'archive.org'; +``` + +Top publishers that have journals in wayback: + +```sql +SELECT publisher, COUNT(*) FROM journal LEFT JOIN homepage ON journal.issnl = homepage.issnl WHERE homepage.domain = 'archive.org' GROUP BY journal.publisher ORDER BY COUNT(*) DESC LIMIT 10; +``` + +Homepage URL counts: + +```sql +SELECT COUNT(*) as rows, COUNT(DISTINCT issnl) as issnls, COUNT(DISTINCT surt) as surts FROM homepage; +``` + +Journals with most unique SURTs: + +```sql +SELECT issnl, COUNT(*) from homepage GROUP BY issnl ORDER BY COUNT(*) DESC LIMIT 10; +``` + + +Blocked domains: + +```sql +SELECT domain, count(*), sum(blocked) from homepage group by domain order by sum(blocked) desc limit 20; +``` + +Top duplicated URLs and SURTs: + +```sql +SELECT url, COUNT(*) FROM homepage GROUP BY url ORDER BY COUNT(*) DESC LIMIT 10; +``` + +Top terminal URLs catch cases where many URLs redirect to a single page: + +```sql +SELECT terminal_url, COUNT(DISTINCT issnl) FROM homepage WHERE terminal_url IS NOT NULL GROUP BY terminal_url ORDER BY COUNT(DISTINCT issnl) DESC LIMIT 20; +``` + +```sql +SELECT surt, COUNT(*) FROM homepage GROUP BY surt ORDER BY COUNT(*) DESC LIMIT 10; +``` -- cgit v1.2.3