From c4caeaedc63c2bb2f9bb9f6be6826480aefc9caf Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 4 Apr 2022 15:27:02 -0700 Subject: sql: fix reingest query missing type on LEFT JOIN; wrap in read-only transaction --- sql/dump_reingest_bulk.sql | 8 +++++++- sql/dump_reingest_quarterly.sql | 7 ++++++- sql/dump_reingest_spn.sql | 8 +++++++- sql/dump_reingest_weekly.sql | 7 ++++++- sql/reingest_weekly.sh | 2 +- 5 files changed, 27 insertions(+), 5 deletions(-) diff --git a/sql/dump_reingest_bulk.sql b/sql/dump_reingest_bulk.sql index 403fb20..698db7a 100644 --- a/sql/dump_reingest_bulk.sql +++ b/sql/dump_reingest_bulk.sql @@ -1,7 +1,11 @@ +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + COPY ( SELECT row_to_json(ingest_request.*) FROM ingest_request - LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url + LEFT JOIN ingest_file_result ON + ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type WHERE (ingest_request.ingest_type = 'pdf' OR ingest_request.ingest_type = 'html') @@ -23,3 +27,5 @@ COPY ( AND ingest_file_result.status != 'spn2-error:network-authentication-required' AND ingest_file_result.status != 'spn2-error:unknown' ) TO '/srv/sandcrawler/tasks/reingest_bulk_current.rows.json'; + +ROLLBACK; diff --git a/sql/dump_reingest_quarterly.sql b/sql/dump_reingest_quarterly.sql index c425a15..c377bf0 100644 --- a/sql/dump_reingest_quarterly.sql +++ b/sql/dump_reingest_quarterly.sql @@ -1,7 +1,11 @@ +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + COPY ( SELECT row_to_json(ingest_request.*) FROM ingest_request - LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url + LEFT JOIN ingest_file_result ON + ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type WHERE (ingest_request.ingest_type = 'pdf' OR ingest_request.ingest_type = 'html') @@ -32,3 +36,4 @@ COPY ( -- AND (ingest_request.ingest_request_source != 'fatcat-changelog' -- AND ingest_request.ingest_request_source != 'fatcat-ingest') +ROLLBACK; diff --git a/sql/dump_reingest_spn.sql b/sql/dump_reingest_spn.sql index b0051dd..65a8796 100644 --- a/sql/dump_reingest_spn.sql +++ b/sql/dump_reingest_spn.sql @@ -1,7 +1,11 @@ +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + COPY ( SELECT row_to_json(ingest_request.*) FROM ingest_request - LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url + LEFT JOIN ingest_file_result ON + ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type WHERE (ingest_request.ingest_type = 'pdf' OR ingest_request.ingest_type = 'html') @@ -26,3 +30,5 @@ COPY ( AND ingest_file_result.status != 'spn2-error:network-authentication-required' AND ingest_file_result.status != 'spn2-error:unknown' ) TO '/srv/sandcrawler/tasks/reingest_spn.rows.json'; + +ROLLBACK; diff --git a/sql/dump_reingest_weekly.sql b/sql/dump_reingest_weekly.sql index e529945..4acec38 100644 --- a/sql/dump_reingest_weekly.sql +++ b/sql/dump_reingest_weekly.sql @@ -1,7 +1,11 @@ +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + COPY ( SELECT row_to_json(ingest_request.*) FROM ingest_request - LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url + LEFT JOIN ingest_file_result ON + ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type WHERE (ingest_request.ingest_type = 'pdf' OR ingest_request.ingest_type = 'html') @@ -32,3 +36,4 @@ COPY ( -- AND (ingest_request.ingest_request_source != 'fatcat-changelog' -- AND ingest_request.ingest_request_source != 'fatcat-ingest') +ROLLBACK; diff --git a/sql/reingest_weekly.sh b/sql/reingest_weekly.sh index 04ce39d..b60bd0e 100755 --- a/sql/reingest_weekly.sh +++ b/sql/reingest_weekly.sh @@ -13,7 +13,7 @@ sudo -u sandcrawler pipenv run \ cat /srv/sandcrawler/tasks/reingest_weekly_current.json \ | shuf \ - | head -n60000 \ + | head -n80000 \ | jq . -c \ | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 -- cgit v1.2.3