From 9a558d1a8fd4021908c6195de31237a714a41b9d Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 8 Jul 2020 19:53:19 -0700 Subject: update reports --- reports/report.2020-07-09.html | 1226 ++++++++++++++++++++++++++++++++++++++++ reports/report_template.md | 25 +- 2 files changed, 1245 insertions(+), 6 deletions(-) create mode 100644 reports/report.2020-07-09.html diff --git a/reports/report.2020-07-09.html b/reports/report.2020-07-09.html new file mode 100644 index 0000000..629b5bf --- /dev/null +++ b/reports/report.2020-07-09.html @@ -0,0 +1,1226 @@ +

Fatcat "Chocula" Journal Metadata Summary

+

This report is auto-generated from a sqlite database file, which should be available/included.

+
+ + + + + + +
datetime('now')
2020-07-09 02:41:48
QUERY: SELECT datetime('now');
+

Note that pretty much all of the fatcat release stats are on a release, not +work basis, so there may be over-counting. Also, as of July 2019 there were +over 1.5 million OA longtail releases which are not linked to a container +(journal).

+
+ + + + + + + + + + +
seqnamefile
0main/home/bnewbold/code/chocula/chocula.sqlite
QUERY: PRAGMA database_list;
+

Overview

+

Top publishers by journal count:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
publisherCOUNT(*)
50321
Elsevier4909
Springer3180
Taylor & Francis3049
John Wiley & Sons, Inc2325
SAGE Publications1442
J-STAGE1406
Peter Lang International Academic Publishers1356
SciELO1188
Informa UK (Taylor & Francis)738
Springer-Verlag707
Cambridge University Press598
Walter de Gruyter GmbH553
Georg Thieme Verlag KG515
OMICS Publishing Group497
IEEE, Inc483
Medknow Publications473
JSTOR469
Oxford University Press461
Hindawi456
Bentham Science445
De Gruyter Open Sp. z o.o.442
Wolters Kluwer Health427
CAIRN416
Egypts Presidential Specialized Council for Education and Scientific Research402
QUERY: SELECT publisher, COUNT(*)
+FROM journal
+GROUP BY publisher
+ORDER BY COUNT(*) DESC
+LIMIT 25;
+

Top countries by number of journals:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
countryCOUNT(*)
us31066
12064
id10204
de8220
in7491
gb7358
fr6947
uk5988
nl5579
br4783
QUERY: SELECT  country,
+COUNT(*)
+FROM journal
+GROUP BY country
+ORDER BY COUNT(*) DESC
+LIMIT 10;
+

... by number of papers:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
countryCOUNT(*)SUM(release_count)
us3106632295341
gb735813562766
nl557911027952
de82207555626
jp39245538593
uk59884672277
fr69472211672
ch21841973488
ru33221447208
in74911206194
QUERY: SELECT  country,
+COUNT(*),
+SUM(release_count)
+FROM journal
+GROUP BY country
+ORDER BY SUM(release_count) DESC
+LIMIT 10;
+

Top languages by number of journals:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
langCOUNT(*)
119624
en33516
fr2545
es1986
pt1278
id810
fa705
de687
ja627
ru455
QUERY: SELECT  lang,
+COUNT(*)
+FROM journal
+GROUP BY lang
+ORDER BY COUNT(*) DESC
+LIMIT 10;
+

... by number of papers:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
langCOUNT(*)SUM(release_count)
en3351652300274
11962439741739
de6871081202
ja627701129
fr2545460774
es1986328252
pt1278265893
ru455236151
it365107943
id81064503
QUERY: SELECT  lang,
+COUNT(*),
+SUM(release_count)
+FROM journal
+GROUP BY lang
+ORDER BY SUM(release_count) DESC
+LIMIT 10;
+

Fatcat Fulltext Coverage

+

Fulltext coverage by publisher type:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
publisher_typeAVG(ia_frac)AVG(preserved_frac)journal_countpaper_count
big50.202722434013542670.74521701627304141435839085852
society0.37943770910205450.52176078848653951162217531161
0.285081935244611560.390131784887492457663417473701
unipress0.52557337331844240.71009369484341882386014361
commercial0.33104781934326860.677467640623539959085794436
longtail0.69565292533374490.7448822575700694428145612002
repository0.123404746829803470.240381095642414537651037428
scielo0.81878975159915980.84752833076645411589935919
other0.17723821188839350.629416843982798956852087
archive0.328584779789559630.9868821710035494543727385
oa0.76740001448399340.80211620611885771854669492
QUERY: SELECT  publisher_type,
+AVG(ia_frac),
+AVG(preserved_frac),
+COUNT(*) AS journal_count,
+SUM(release_count) AS paper_count
+FROM journal
+GROUP BY publisher_type
+ORDER BY SUM(release_count) DESC;
+

Top publishers with very little coverage:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
publisherjournal_countAVG(ia_frac)
96410.0017593496094944628
Elsevier18910.017255730382637596
Taylor & Francis10410.026388684277766576
J-STAGE10000.008786544945748286
John Wiley & Sons, Inc7630.021697556560790764
Informa UK (Taylor & Francis)5860.01002503920645208
SAGE Publications5680.018712710467758988
Springer-Verlag3850.015267708054728164
Springer3590.025752583122217714
JSTOR2700.010432032891517822
QUERY: SELECT  publisher,
+COUNT(*) AS journal_count,
+AVG(ia_frac)
+FROM journal
+WHERE ia_frac < 0.05
+GROUP BY publisher
+ORDER BY journal_count DESC
+LIMIT 10;
+

Amount of fulltext by SHERPA/RoMEO journal color:

+
+ + + + + + + + + + + + + + + + + + + + + + + + +
sherpa_colorSUM(ia_count)
8203410
blue1071423
green10304362
white732457
yellow2490476
QUERY: SELECT  sherpa_color,
+SUM(ia_count)
+FROM journal
+GROUP BY sherpa_color;
+

Journal Homepages

+

Homepage URL counts:

+
+ + + + + + + + +
unique_urlsjournals_with_hompages
188588118879
QUERY: SELECT COUNT(DISTINCT surt) as unique_urls, COUNT(DISTINCT issnl) as journals_with_hompages FROM homepage;
+

Journal counts by homepage status:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
any_homepageany_live_homepageany_gwb_homepageCOUNT(*)frac
000464020.28
100128820.08
101102660.06
11087210.05
111870100.53
QUERY: SELECT any_homepage, any_live_homepage, any_gwb_homepage, COUNT(*), ROUND(1.0 * COUNT(*) / (SELECT COUNT(*) FROM journal), 2) AS frac FROM journal GROUP BY any_homepage, any_live_homepage, any_gwb_homepage;
+

Number of unique journals that have a homepage pointing to wayback or archive.org:

+
+ + + + + + +
COUNT(DISTINCT issnl)
1453
QUERY: SELECT COUNT(DISTINCT issnl) FROM homepage WHERE domain = 'archive.org';
+

Journals with the most homepage URLs:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
issnlCOUNT(*)
0036-643921
1487-061416
2375-038316
2374-403015
0097-632614
0749-405X13
1521-909713
0009-700412
0030-707612
0717-554X12
QUERY: SELECT  issnl,
+COUNT(*)
+FROM homepage
+GROUP BY issnl
+ORDER BY COUNT(*) DESC
+LIMIT 10;
+

Top/redundant URLs and SURTs:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
surtCOUNT(*)
com,indianjournals)/80
com,hindawi)/71
au,com,informit,search)/search;res=apaft64
com,umi)/pqdauto51
org,rsc,pubs)/en/ebooks50
com,umi)/proquest48
org,ieee,ieeexplore)/xplore/conferences.jsp40
org,omicsonline)/37
com,idealibrary)/36
com,wiley,interscience)/31
QUERY: SELECT  surt,
+COUNT(*)
+FROM homepage
+GROUP BY surt
+ORDER BY COUNT(*) DESC
+LIMIT 10;
+

What is the deal with all those "benjamins" URLs?

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
publishername
John Benjamins Publishing CompanyNOWELE
Studia Uralo-Altaica
John Benjamins Publishing CompanyLanguage Problems and Language Planning
John Benjamins Publishing CompanyLingvisticæ investigationes
John Benjamins Publishing CompanyLinguistics of the TIbeto-Burman Area
John Benjamins Publishing CompanyPragmatics & Cognition
John Benjamins Publishing CompanyTerminology
John Benjamins Publishing CompanyWritten Language & Literacy
FORUM: Revue internationale d?interprétation et de traduction / International Journal of Interpretation and Translation
John Benjamins Publishing CompanyEnglish Text Construction
John Benjamins Publishing CompanyConstructions and Frames
John Benjamins Publishing CompanyPragmatics and Society
John Benjamins Publishing CompanyTranslation and Interpreting Studies
John Benjamins Publishing CompanyLanguage and Dialogue
John Benjamins Publishing CompanyMetaphor in Language, Cognition, and Communication
Hamburg Studies on Linguistic Diversity
John Benjamins Publishing CompanyTranslation Spaces
Studies in Arabic Linguistics
John Benjamins Publishing CompanyJournal of Immersion and Content-Based Language Education (JICB)
Children's Literature, Culture, and Cognition
John Benjamins Publishing CompanyJournal of Language Aggression and Conflict
FILLM Studies in Languages and Literatures
Advances in Historical Sociolinguistics
John Benjamins Publishing CompanyLinguistic Landscape
John Benjamins Publishing CompanyInternational Journal of Learner Corpus Research
John Benjamins Publishing CompanyJournal of Second Language Pronunciation
ITL - International Journal of Applied Linguistics
John Benjamins Publishing CompanyCognitive Individual Differences in Second Language Processing and Acquisition
John Benjamins Publishing CompanyFORUM
John Benjamins Publishing CompanyStudies in Germanic Linguistics
QUERY: SELECT  publisher,
+name
+FROM journal
+LEFT JOIN homepage ON journal.issnl = homepage.issnl
+WHERE homepage.surt = 'com,benjamins)/';
+

Domains that block us:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
domainjournal_homepagesSUM(blocked)
jstor.org76747507
tandfonline.com45684505
wiley.com4289721
informahealthcare.com221220
brill.nl234164
bentham.org152149
computer.org14364
ucpress.edu6459
dekker.com4847
uem.br4942
maney.co.uk4141
ingentaconnect.com41731
heldref.org2525
amcity.com2323
managementjournals.com1919
ucpressjournals.com1919
ametsoc.org3218
mdconsult.com2717
ikpress.org1816
rodopi.nl2016
QUERY: SELECT  domain,
+COUNT(*) as journal_homepages,
+SUM(blocked)
+FROM homepage
+GROUP BY domain
+ORDER BY SUM(blocked) DESC
+LIMIT 20;
+

Top duplicated homepage URLs:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
urlCOUNT(*)
http://www.indianjournals.com/73
http://www.hindawi.com/70
http://search.informit.com.au/search;res=APAFT60
http://www.umi.com/proquest46
http://www.umi.com/pqdauto/45
http://ieeexplore.ieee.org/Xplore/conferences.jsp40
http://omicsonline.org/36
http://www.idealibrary.com/36
http://ieeexplore.ieee.org/xpl/conferences.jsp24
http://www.metapress.com/24
http://www.randspublications.org/22
http://www.studia.ubbcluj.ro/serii/index_en.html22
http://find.galegroup.com/ips/publicationSearch.do21
http://jurnal.unimed.ac.id/21
http://www.bioinfo.in/journals.php20
http://www.interscience.wiley.com/20
http://www.commongroundpublishing.com/19
http://www.haworthpress.com/19
http://www.heinonline.org/19
http://www.infosci-journals.com/19
QUERY: SELECT  url,
+COUNT(*)
+FROM homepage
+GROUP BY url
+ORDER BY COUNT(*) DESC
+LIMIT 20;
+

Number of journals with a homepage that points to web.archive.org or archive.org:

+
+ + + + + + +
COUNT(DISTINCT issnl)
1453
QUERY: SELECT COUNT(DISTINCT issnl)
+FROM homepage
+WHERE domain = 'archive.org';
+

Top publishers that have journals in wayback:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
publisherCOUNT(*)
653
EDP Sciences23
CAIRN18
OpenEdition18
Elsevier6
Springer6
PERSEE Program5
Peer Community In5
Institut de recherche et d'histoire des textes (France)4
San Lucas Medical4
QUERY: SELECT  publisher,
+COUNT(*)
+FROM journal
+LEFT JOIN homepage ON journal.issnl = homepage.issnl
+WHERE homepage.domain = 'archive.org'
+GROUP BY journal.publisher
+ORDER BY COUNT(*) DESC
+LIMIT 10;
+

Top publishers by number of journals missing a homepage:

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
publisherCOUNT(*)
21460
Peter Lang International Academic Publishers1270
Elsevier876
J-STAGE864
Egypts Presidential Specialized Council for Education and Scientific Research354
Georg Thieme Verlag KG288
Al Manhal FZ, LLC216
Informa UK (Taylor & Francis)202
Springer-Verlag156
ELSEVIER LTD145
Inderscience122
African Journals Online121
Diva Enterprises Private Limited119
PERSEE Program118
Sabinet109
SAGE Publications103
Brill99
Superintendent of Government Documents99
Taylor & Francis98
Bentham Science94
QUERY: SELECT  publisher,
+COUNT(*)
+FROM journal
+WHERE any_homepage=0
+GROUP BY publisher
+ORDER BY COUNT(*) DESC
+LIMIT 20;
+
diff --git a/reports/report_template.md b/reports/report_template.md index ac98649..ad64c5d 100644 --- a/reports/report_template.md +++ b/reports/report_template.md @@ -1,16 +1,17 @@ - - -# Chocula Journal Aggregate Stats +# Fatcat "Chocula" Journal Metadata Summary +This report is auto-generated from a sqlite database file, which should be available/included. ```sql SELECT datetime('now'); ``` +Note that pretty much all of the fatcat release stats are on a *release*, not +*work* basis, so there may be over-counting. Also, as of July 2019 there were +over 1.5 million OA longtail releases which are *not* linked to a container +(journal). + ```sql PRAGMA database_list; ``` @@ -118,6 +119,18 @@ Homepage URL counts: SELECT COUNT(DISTINCT surt) as unique_urls, COUNT(DISTINCT issnl) as journals_with_hompages FROM homepage; ``` +Journal counts by homepage status: + +```sql +SELECT any_homepage, any_live_homepage, any_gwb_homepage, COUNT(*), ROUND(1.0 * COUNT(*) / (SELECT COUNT(*) FROM journal), 2) AS frac FROM journal GROUP BY any_homepage, any_live_homepage, any_gwb_homepage; +``` + +Number of unique journals that have a homepage pointing to wayback or archive.org: + +```sql +SELECT COUNT(DISTINCT issnl) FROM homepage WHERE domain = 'archive.org'; +``` + Journals with the most homepage URLs: ```sql -- cgit v1.2.3