aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-06-23 23:15:41 -0700
committerBryan Newbold <bnewbold@archive.org>2020-06-23 23:15:41 -0700
commit1aea4911bf336570a1b6b32d75eced523c329ed6 (patch)
treedd479e2207b74a0b1450a34ab166c1a8cb98d062
parent4018c62adc32a88a76f7bd54f1003d58a29fe120 (diff)
downloadchocula-1aea4911bf336570a1b6b32d75eced523c329ed6.tar.gz
chocula-1aea4911bf336570a1b6b32d75eced523c329ed6.zip
update notes about longtail homepage URLs
-rw-r--r--notes/explore.md74
-rw-r--r--notes/longtail_crawl.txt41
2 files changed, 112 insertions, 3 deletions
diff --git a/notes/explore.md b/notes/explore.md
new file mode 100644
index 0000000..6193309
--- /dev/null
+++ b/notes/explore.md
@@ -0,0 +1,74 @@
+
+## 2020-06-23
+
+Where do back ISSN-Ls come from? Answer: exiting fatcat metadata.
+
+ select count(*) from journal where valid_issnl = 0;
+ => 4
+
+ select count(*) from journal where known_issnl = 0;
+ => 2304
+
+ select directory.slug, count(*) from journal join directory on journal.issnl = directory.issnl where journal.known_issnl = 0 group by directory.slug order by count(*) desc limit 20;
+
+ select count(*) from journal join fatcat_container on journal.issnl = fatcat_container.issnl where journal.known_issnl = 0;
+ => 2,328
+ => note: still a few dupe ISSN-L in fatcat_container
+
+
+How many journals might be longtail but have just a handful of DOIs? And would
+be lost if we filter by `has_doi=0`?
+
+ select count(*) from journal where has_dois = 1 and release_count <= 10;
+ 9054
+
+ select count(*) from journal where has_dois = 1 and is_longtail = 1;
+ 15,575
+
+ select count(*) from journal where has_dois = 1 and (is_active is null or is_active = 1) and release_count <= 10 and any_homepage = 1;
+ => 5,843
+
+How many *journals* would old query turn up? NOTE: have not verified new homepage URLs
+
+ SELECT COUNT(DISTINCT journal.issnl), COUNT(DISTINCT homepage.url)
+ FROM homepage
+ LEFT JOIN journal ON homepage.issnl = journal.issnl
+ WHERE
+ homepage.terminal_status_code = 200
+ AND journal.is_longtail = 1
+ AND homepage.domain != 'archive.org'
+ AND homepage.host NOT LIKE '%scielo%'
+ AND homepage.domain != 'jst.go.jp'
+ AND homepage.host != 'books.google.com'
+ AND homepage.host != 'www.google.com'
+ AND journal.has_dois = 0;
+ => 16,471 journals
+ => 19,460 URLs
+
+New tweaks:
+
+ SELECT COUNT(DISTINCT journal.issnl), COUNT(DISTINCT homepage.url)
+ FROM homepage
+ LEFT JOIN journal ON homepage.issnl = journal.issnl
+ WHERE
+ (homepage.terminal_status_code = 200 or homepage.blocked or homepage.terminal_status_code is null)
+ AND homepage.domain != 'archive.org'
+ AND homepage.host NOT LIKE '%scielo%'
+ AND homepage.domain != 'jst.go.jp'
+ AND homepage.host != 'books.google.com'
+ AND homepage.host != 'www.google.com'
+ AND homepage.domain != 'oclc.org'
+ AND homepage.host != 'www.ncbi.nlm.nih.gov'
+ AND homepage.domain != 'umi.com'
+ AND homepage.domain != 'doi.org'
+ AND homepage.host != 'www.thefreelibrary.com'
+ AND (journal.is_longtail = 1
+ OR journal.publisher_type = 'society'
+ OR journal.publisher_type = 'unipress'
+ OR journal.publisher_type IS NULL)
+ AND (journal.has_dois = 0 or journal.release_count < 20);
+ => 57,637 journals
+ => 70,770 homepages
+
+This is a significant increase in size over previous crawls!
+
diff --git a/notes/longtail_crawl.txt b/notes/longtail_crawl.txt
index 9d2fe96..20cdad3 100644
--- a/notes/longtail_crawl.txt
+++ b/notes/longtail_crawl.txt
@@ -1,11 +1,47 @@
+## 2020 Crawl Query
+
.mode tabs
.output longtail_homepage_urls.tsv
SELECT homepage.url, homepage.issnl
- FROM homepage LEFT JOIN journal ON homepage.issnl = journal.issnl
- WHERE homepage.terminal_status_code = 200 AND journal.is_longtail = 1 AND homepage.domain != 'archive.org' AND homepage.host NOT LIKE '%scielo%' AND homepage.domain != 'jst.go.jp' AND homepage.host != 'books.google.com' AND homepage.host != 'www.google.com' AND journal.has_dois = 0;
+ FROM homepage
+ LEFT JOIN journal ON homepage.issnl = journal.issnl
+ WHERE
+ (homepage.terminal_status_code = 200 or homepage.blocked or homepage.terminal_status_code is null)
+ AND homepage.domain != 'archive.org'
+ AND homepage.host NOT LIKE '%scielo%'
+ AND homepage.domain != 'jst.go.jp'
+ AND homepage.host != 'books.google.com'
+ AND homepage.host != 'www.google.com'
+ AND homepage.domain != 'oclc.org'
+ AND homepage.host != 'www.ncbi.nlm.nih.gov'
+ AND homepage.domain != 'umi.com'
+ AND homepage.domain != 'doi.org'
+ AND homepage.host != 'www.thefreelibrary.com'
+ AND (journal.is_longtail = 1
+ OR journal.publisher_type = 'society'
+ OR journal.publisher_type = 'unipress'
+ OR journal.publisher_type IS NULL)
+ AND (journal.has_dois = 0 or journal.release_count < 20);
+
+## Older 2019 Crawl Query
+
+ .mode tabs
+ .output longtail_homepage_urls.tsv
+ SELECT homepage.url, homepage.issnl
+ FROM homepage
+ LEFT JOIN journal ON homepage.issnl = journal.issnl
+ WHERE
+ homepage.terminal_status_code = 200
+ AND journal.is_longtail = 1
+ AND homepage.domain != 'archive.org'
+ AND homepage.host NOT LIKE '%scielo%'
+ AND homepage.domain != 'jst.go.jp'
+ AND homepage.host != 'books.google.com'
+ AND homepage.host != 'www.google.com'
+ AND journal.has_dois = 0;
## Test Queries
@@ -13,7 +49,6 @@
SELECT ... FROM homepage LEFT JOIN journal ON homepage.issnl = journal.issnl WHERE homepage.terminal_status_code = 200 AND journal.is_longtail = 1;
-
SELECT homepage.domain, COUNT(*) FROM homepage LEFT JOIN journal ON homepage.issnl = journal.issnl WHERE homepage.terminal_status_code = 200 AND journal.is_longtail = 1 AND homepage.domain != 'archive.org' AND homepage.host NOT LIKE '%scielo%' AND homepage.domain != 'jst.go.jp' AND homepage.host != 'books.google.com' AND homepage.host != 'www.google.com' AND journal.has_dois = 0
GROUP BY homepage.domain ORDER BY COUNT(*) DESC LIMIT 20;