From 043b35040e4385c674267aa88c4056bdfdd9cb6c Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 3 Sep 2020 18:27:44 -0700 Subject: update notes and explore --- TODO.md | 16 +++++++++++++++- notes/explore.md | 11 +++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/TODO.md b/TODO.md index 29b1fe0..befbd48 100644 --- a/TODO.md +++ b/TODO.md @@ -1,4 +1,5 @@ + priorities: - coverage stats, particularly for longtail - `is_active` coverage @@ -10,9 +11,22 @@ priorities: ## Sources -- unpaywall journal-level classification +- preservation coverage + x hathitrust (huge!) + https://www.hathitrust.org/hathifiles_description + x PKP PLN (ONIX) + https://pkp.sfu.ca/pkp-pn/ + http://pkp.sfu.ca/files/pkppn/onix.csv + => Scholars Portal (canada) + received ONIX XML, hoping for KBART format + => Cariniana + => National Digital Preservation Program, China + => Library of Congress +- additional hathitrust (many more ISSNs/journals) +- unpaywall journal-level classification (OA color) => ask for journal-level dump or do munging - jurn matches + => somebody on github did an openrefine match - public scopus list (?) - scrape/munge public clarivate dumps - repositories (?) diff --git a/notes/explore.md b/notes/explore.md index 5f23d35..c25404d 100644 --- a/notes/explore.md +++ b/notes/explore.md @@ -12,6 +12,17 @@ PKP PLN numbers result in? So about 60k releases. +How about Hathitrust? + + select count(*), sum(journal.release_count), sum(journal.preserved_count) from journal join directory on journal.issnl = directory.issnl where directory.slug = 'hathitrust'; + + count(*) sum(journal.release_count) sum(journal.preserved_count) + ---------- -------------------------- ---------------------------- + 26628 48160184 36905342 + +Much larger potential impact, of 11+ million releases, though unclear how many +are actually in the hathitrust archives. + ## 2020-06-23 Where do back ISSN-Ls come from? Answer: exiting fatcat metadata. -- cgit v1.2.3