diff options
author | Bryan Newbold <bnewbold@archive.org> | 2022-10-03 10:16:26 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2022-10-03 10:16:26 -0700 |
commit | 54e14814080d9a706ff6f15694b3b54918200169 (patch) | |
tree | c2ce152acb64a365408a57728c18d960b954c1f5 /sql/dump_reingest_terminalstatus.sql | |
parent | a04468041cd81ad90aa76ec15788a5ffacb6eec2 (diff) | |
download | sandcrawler-54e14814080d9a706ff6f15694b3b54918200169.tar.gz sandcrawler-54e14814080d9a706ff6f15694b3b54918200169.zip |
reingests: update scripts and SQL
Diffstat (limited to 'sql/dump_reingest_terminalstatus.sql')
-rw-r--r-- | sql/dump_reingest_terminalstatus.sql | 34 |
1 files changed, 34 insertions, 0 deletions
-- sql/dump_reingest_terminalstatus.sql
--
-- Dumps ingest requests that are candidates for re-ingest, one JSON object
-- (row_to_json of the full ingest_request row) per output line.
--
-- A request is selected when all of the following hold:
--   * it has an ingest_file_result with the same base_url and ingest_type,
--     and that result has hit = false and status = 'terminal-bad-status'
--   * the result's terminal_status_code is one of 404, 429, 500, 502, 503
--   * the request was created more than 72 hours but less than 10 days ago
--   * ingest_request_source is 'fatcat-changelog' or 'fatcat-ingest'
--   * base_url starts with https://doi.org/10.3390/, https://doi.org/10.1103/
--     or https://doi.org/10.1155/
--
-- COPY ... TO '<path>' writes the file on the database server, not the client.

BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;

COPY (
    SELECT row_to_json(ingest_request.*)
    FROM ingest_request
    -- INNER JOIN (not LEFT JOIN): the WHERE clause filters on
    -- ingest_file_result columns, which discards unmatched requests anyway,
    -- so an outer join would return exactly the same rows.
    INNER JOIN ingest_file_result
        ON ingest_file_result.base_url = ingest_request.base_url
        AND ingest_file_result.ingest_type = ingest_request.ingest_type
    WHERE
        ingest_file_result.hit = false
        -- only requests older than 72 hours and newer than 10 days
        AND ingest_request.created < NOW() - '72 hour'::INTERVAL
        AND ingest_request.created > NOW() - '10 day'::INTERVAL
        AND ingest_request.ingest_request_source IN ('fatcat-changelog', 'fatcat-ingest')
        AND ingest_file_result.status = 'terminal-bad-status'
        AND ingest_file_result.terminal_status_code IN (500, 502, 503, 429, 404)
        AND (
            ingest_request.base_url LIKE 'https://doi.org/10.3390/%'
            OR ingest_request.base_url LIKE 'https://doi.org/10.1103/%'
            OR ingest_request.base_url LIKE 'https://doi.org/10.1155/%'
        )
) TO '/srv/sandcrawler/tasks/reingest_terminalstatus_current.rows.json';

-- bulk re-tries would instead use this source filter:
--      AND (ingest_request.ingest_request_source != 'fatcat-changelog'
--           AND ingest_request.ingest_request_source != 'fatcat-ingest')

-- The transaction is READ ONLY, so there is nothing to commit; just end it.
ROLLBACK;