aboutsummaryrefslogtreecommitdiffstats
path: root/notes/longtail_crawl.txt
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-06-23 23:15:41 -0700
committerBryan Newbold <bnewbold@archive.org>2020-06-23 23:15:41 -0700
commit1aea4911bf336570a1b6b32d75eced523c329ed6 (patch)
treedd479e2207b74a0b1450a34ab166c1a8cb98d062 /notes/longtail_crawl.txt
parent4018c62adc32a88a76f7bd54f1003d58a29fe120 (diff)
downloadchocula-1aea4911bf336570a1b6b32d75eced523c329ed6.tar.gz
chocula-1aea4911bf336570a1b6b32d75eced523c329ed6.zip
update notes about longtail homepage URLs
Diffstat (limited to 'notes/longtail_crawl.txt')
-rw-r--r--notes/longtail_crawl.txt41
1 files changed, 38 insertions, 3 deletions
diff --git a/notes/longtail_crawl.txt b/notes/longtail_crawl.txt
index 9d2fe96..20cdad3 100644
--- a/notes/longtail_crawl.txt
+++ b/notes/longtail_crawl.txt
@@ -1,11 +1,47 @@
+## 2020 Crawl Query
+
.mode tabs
.output longtail_homepage_urls.tsv
SELECT homepage.url, homepage.issnl
- FROM homepage LEFT JOIN journal ON homepage.issnl = journal.issnl
- WHERE homepage.terminal_status_code = 200 AND journal.is_longtail = 1 AND homepage.domain != 'archive.org' AND homepage.host NOT LIKE '%scielo%' AND homepage.domain != 'jst.go.jp' AND homepage.host != 'books.google.com' AND homepage.host != 'www.google.com' AND journal.has_dois = 0;
+ FROM homepage
+ LEFT JOIN journal ON homepage.issnl = journal.issnl -- NOTE: the has_dois/release_count filter below is NULL for homepages with no journal row, so those rows are dropped
+ WHERE
+ (homepage.terminal_status_code = 200 OR homepage.blocked OR homepage.terminal_status_code IS NULL) -- status 200, blocked flag set, or no terminal status recorded
+ AND homepage.domain != 'archive.org'
+ AND homepage.host NOT LIKE '%scielo%'
+ AND homepage.domain != 'jst.go.jp'
+ AND homepage.host != 'books.google.com'
+ AND homepage.host != 'www.google.com'
+ AND homepage.domain != 'oclc.org'
+ AND homepage.host != 'www.ncbi.nlm.nih.gov'
+ AND homepage.domain != 'umi.com'
+ AND homepage.domain != 'doi.org'
+ AND homepage.host != 'www.thefreelibrary.com'
+ AND (journal.is_longtail = 1
+ OR journal.publisher_type = 'society'
+ OR journal.publisher_type = 'unipress'
+ OR journal.publisher_type IS NULL)
+ AND (journal.has_dois = 0 OR journal.release_count < 20);
+
+## Older 2019 Crawl Query
+
+ .mode tabs
+ .output longtail_homepage_urls.tsv
+ SELECT homepage.url, homepage.issnl
+ FROM homepage
+ LEFT JOIN journal ON homepage.issnl = journal.issnl -- NOTE: the journal.is_longtail / journal.has_dois filters below exclude homepages with no journal row
+ WHERE
+ homepage.terminal_status_code = 200 -- only homepages whose terminal status code is 200
+ AND journal.is_longtail = 1
+ AND homepage.domain != 'archive.org'
+ AND homepage.host NOT LIKE '%scielo%'
+ AND homepage.domain != 'jst.go.jp'
+ AND homepage.host != 'books.google.com'
+ AND homepage.host != 'www.google.com'
+ AND journal.has_dois = 0;
## Test Queries
@@ -13,7 +49,6 @@
SELECT ... FROM homepage LEFT JOIN journal ON homepage.issnl = journal.issnl WHERE homepage.terminal_status_code = 200 AND journal.is_longtail = 1;
-
SELECT homepage.domain, COUNT(*) FROM homepage LEFT JOIN journal ON homepage.issnl = journal.issnl WHERE homepage.terminal_status_code = 200 AND journal.is_longtail = 1 AND homepage.domain != 'archive.org' AND homepage.host NOT LIKE '%scielo%' AND homepage.domain != 'jst.go.jp' AND homepage.host != 'books.google.com' AND homepage.host != 'www.google.com' AND journal.has_dois = 0
GROUP BY homepage.domain ORDER BY COUNT(*) DESC LIMIT 20;