aboutsummaryrefslogtreecommitdiffstats
path: root/sql
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-03-02 15:55:53 -0800
committerBryan Newbold <bnewbold@archive.org>2020-03-02 15:55:53 -0800
commit6d41261ac417c61a61d0c794fa07639f454bcd52 (patch)
tree6f1e3fa3f7a2574511fde3716cf89d48cd8978c7 /sql
parent214c35db713ee5f34e9c6e8aa24df6456addcece (diff)
downloadsandcrawler-6d41261ac417c61a61d0c794fa07639f454bcd52.tar.gz
sandcrawler-6d41261ac417c61a61d0c794fa07639f454bcd52.zip
more SQL queries
Diffstat (limited to 'sql')
-rw-r--r--sql/random_queries.md57
1 files changed, 57 insertions, 0 deletions
diff --git a/sql/random_queries.md b/sql/random_queries.md
index 98b32f9..d88f45b 100644
--- a/sql/random_queries.md
+++ b/sql/random_queries.md
@@ -120,3 +120,60 @@ Can also do some quick lookups for a specific domain and protocol like:
FROM ingest_file_result
WHERE terminal_url LIKE 'https://insights.ovid.com/%'
LIMIT 10;
+
+## Bulk Ingest
+
+Show bulk ingest status on links *added* in the past week:
+
+ SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.created >= NOW() - '30 day'::INTERVAL
+ AND ingest_request.link_source = 'unpaywall'
+ GROUP BY ingest_file_result.ingest_type, ingest_file_result.status
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+Top *successful* domains:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.created >= NOW() - '7 day'::INTERVAL
+ AND ingest_request.link_source = 'unpaywall'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status = 'success'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+Summarize non-success domains for the same:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.created >= NOW() - '7 day'::INTERVAL
+ AND ingest_request.link_source = 'unpaywall'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 20;