From 1aea4911bf336570a1b6b32d75eced523c329ed6 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 23 Jun 2020 23:15:41 -0700 Subject: update notes about longtail homepage URLs --- notes/explore.md | 74 ++++++++++++++++++++++++++++++++++++++++++++++++ notes/longtail_crawl.txt | 41 +++++++++++++++++++++++++-- 2 files changed, 112 insertions(+), 3 deletions(-) create mode 100644 notes/explore.md diff --git a/notes/explore.md b/notes/explore.md new file mode 100644 index 0000000..6193309 --- /dev/null +++ b/notes/explore.md @@ -0,0 +1,74 @@ + +## 2020-06-23 + +Where do back ISSN-Ls come from? Answer: exiting fatcat metadata. + + select count(*) from journal where valid_issnl = 0; + => 4 + + select count(*) from journal where known_issnl = 0; + => 2304 + + select directory.slug, count(*) from journal join directory on journal.issnl = directory.issnl where journal.known_issnl = 0 group by directory.slug order by count(*) desc limit 20; + + select count(*) from journal join fatcat_container on journal.issnl = fatcat_container.issnl where journal.known_issnl = 0; + => 2,328 + => note: still a few dupe ISSN-L in fatcat_container + + +How many journals might be longtail but have just a handful of DOIs? And would +be lost if we filter by `has_doi=0`? + + select count(*) from journal where has_dois = 1 and release_count <= 10; + 9054 + + select count(*) from journal where has_dois = 1 and is_longtail = 1; + 15,575 + + select count(*) from journal where has_dois = 1 and (is_active is null or is_active = 1) and release_count <= 10 and any_homepage = 1; + => 5,843 + +How many *journals* would old query turn up? NOTE: have not verified new homepage URLs + + SELECT COUNT(DISTINCT journal.issnl), COUNT(DISTINCT homepage.url) + FROM homepage + LEFT JOIN journal ON homepage.issnl = journal.issnl + WHERE + homepage.terminal_status_code = 200 + AND journal.is_longtail = 1 + AND homepage.domain != 'archive.org' + AND homepage.host NOT LIKE '%scielo%' + AND homepage.domain != 'jst.go.jp' + AND homepage.host != 'books.google.com' + AND homepage.host != 'www.google.com' + AND journal.has_dois = 0; + => 16,471 journals + => 19,460 URLs + +New tweaks: + + SELECT COUNT(DISTINCT journal.issnl), COUNT(DISTINCT homepage.url) + FROM homepage + LEFT JOIN journal ON homepage.issnl = journal.issnl + WHERE + (homepage.terminal_status_code = 200 or homepage.blocked or homepage.terminal_status_code is null) + AND homepage.domain != 'archive.org' + AND homepage.host NOT LIKE '%scielo%' + AND homepage.domain != 'jst.go.jp' + AND homepage.host != 'books.google.com' + AND homepage.host != 'www.google.com' + AND homepage.domain != 'oclc.org' + AND homepage.host != 'www.ncbi.nlm.nih.gov' + AND homepage.domain != 'umi.com' + AND homepage.domain != 'doi.org' + AND homepage.host != 'www.thefreelibrary.com' + AND (journal.is_longtail = 1 + OR journal.publisher_type = 'society' + OR journal.publisher_type = 'unipress' + OR journal.publisher_type IS NULL) + AND (journal.has_dois = 0 or journal.release_count < 20); + => 57,637 journals + => 70,770 homepages + +This is a significant increase in size over previous crawls! + diff --git a/notes/longtail_crawl.txt b/notes/longtail_crawl.txt index 9d2fe96..20cdad3 100644 --- a/notes/longtail_crawl.txt +++ b/notes/longtail_crawl.txt @@ -1,11 +1,47 @@ +## 2020 Crawl Query + .mode tabs .output longtail_homepage_urls.tsv SELECT homepage.url, homepage.issnl - FROM homepage LEFT JOIN journal ON homepage.issnl = journal.issnl - WHERE homepage.terminal_status_code = 200 AND journal.is_longtail = 1 AND homepage.domain != 'archive.org' AND homepage.host NOT LIKE '%scielo%' AND homepage.domain != 'jst.go.jp' AND homepage.host != 'books.google.com' AND homepage.host != 'www.google.com' AND journal.has_dois = 0; + FROM homepage + LEFT JOIN journal ON homepage.issnl = journal.issnl + WHERE + (homepage.terminal_status_code = 200 or homepage.blocked or homepage.terminal_status_code is null) + AND homepage.domain != 'archive.org' + AND homepage.host NOT LIKE '%scielo%' + AND homepage.domain != 'jst.go.jp' + AND homepage.host != 'books.google.com' + AND homepage.host != 'www.google.com' + AND homepage.domain != 'oclc.org' + AND homepage.host != 'www.ncbi.nlm.nih.gov' + AND homepage.domain != 'umi.com' + AND homepage.domain != 'doi.org' + AND homepage.host != 'www.thefreelibrary.com' + AND (journal.is_longtail = 1 + OR journal.publisher_type = 'society' + OR journal.publisher_type = 'unipress' + OR journal.publisher_type IS NULL) + AND (journal.has_dois = 0 or journal.release_count < 20); + +## Older 2019 Crawl Query + + .mode tabs + .output longtail_homepage_urls.tsv + SELECT homepage.url, homepage.issnl + FROM homepage + LEFT JOIN journal ON homepage.issnl = journal.issnl + WHERE + homepage.terminal_status_code = 200 + AND journal.is_longtail = 1 + AND homepage.domain != 'archive.org' + AND homepage.host NOT LIKE '%scielo%' + AND homepage.domain != 'jst.go.jp' + AND homepage.host != 'books.google.com' + AND homepage.host != 'www.google.com' + AND journal.has_dois = 0; ## Test Queries @@ -13,7 +49,6 @@ SELECT ... FROM homepage LEFT JOIN journal ON homepage.issnl = journal.issnl WHERE homepage.terminal_status_code = 200 AND journal.is_longtail = 1; - SELECT homepage.domain, COUNT(*) FROM homepage LEFT JOIN journal ON homepage.issnl = journal.issnl WHERE homepage.terminal_status_code = 200 AND journal.is_longtail = 1 AND homepage.domain != 'archive.org' AND homepage.host NOT LIKE '%scielo%' AND homepage.domain != 'jst.go.jp' AND homepage.host != 'books.google.com' AND homepage.host != 'www.google.com' AND journal.has_dois = 0 GROUP BY homepage.domain ORDER BY COUNT(*) DESC LIMIT 20; -- cgit v1.2.3