diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-12-24 16:35:45 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-01-02 18:12:58 -0800 |
commit | b603a43e45eb3afa01efd0902c5af56f29d979a2 (patch) | |
tree | 2060d21d15416c50c8e723191f85d6d1315f7fab | |
parent | 9beb3caee51c6bb0403c658a71c965dde4c8e55b (diff) | |
download | sandcrawler-b603a43e45eb3afa01efd0902c5af56f29d979a2.tar.gz sandcrawler-b603a43e45eb3afa01efd0902c5af56f29d979a2.zip |
SQL docs update for diesel change
-rw-r--r-- | sql/README.md | 47 | ||||
-rw-r--r-- | sql/example.env | 1 |
2 files changed, 48 insertions, 0 deletions
diff --git a/sql/README.md b/sql/README.md index b171614..38052fd 100644 --- a/sql/README.md +++ b/sql/README.md @@ -5,6 +5,21 @@ No primary storage of anything in this table. Everything should be rapidly re-creatable from dumps, kafka topics (compressed), CDX, petabox metadata, etc. This is a secondary view on all of that. +## Create Database and User + +Create system user with your username like: + + sudo su postgres + createuser -s bnewbold + +Create database using `diesel` tool (see fatcat rust docs for install notes): + + # DANGER: will delete/recreate entire database + diesel database reset + +In the future would probably be better to create a real role/password and +supply these via `DATABASE_URL` env variable. + ## Schema schema/database name is 'sandcrawler' @@ -124,3 +139,35 @@ Questions we might want to answer http get :3030/cdx?url=eq.https://coleccionables.mercadolibre.com.ar/arduino-pdf_Installments_NoInterest_BestSellers_YES http get :3030/file_meta?sha1hex=eq.120582c855a7cc3c70a8527c560d7f27e6027278 +## Parse URLs + +One approach is to do regexes, something like: + + SELECT substring(column_name FROM '[^/]+://([^/]+)/') AS domain_name FROM table_name; + +Eg: + + SELECT DISTINCT(domain), COUNT(domain) + FROM (select substring(base_url FROM '[^/]+://([^/]*)') as domain FROM ingest_file_result) t1 + WHERE t1.domain != '' + GROUP BY domain + ORDER BY COUNT DESC + LIMIT 10; + +Or: + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 10; + +Can also do some quick lookups for a specific domain and protocol like: + + SELECT * + FROM ingest_file_result + WHERE terminal_url LIKE 'https://insights.ovid.com/%' + LIMIT 10; + diff --git a/sql/example.env b/sql/example.env new file mode 100644 index 0000000..3a13689 --- /dev/null +++ b/sql/example.env @@ -0,0 +1 @@ +DATABASE_URL="postgres://fatcat:tactaf@localhost/sandcrawler" |