From 8fe71d3395e6d4d0aa2850945dda73bd82d57bed Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 25 Jun 2020 16:33:20 -0700 Subject: commit old notes on a one-off CDX table cleanup --- notes/tasks/2020-01-27_cleanup_cdx.md | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 notes/tasks/2020-01-27_cleanup_cdx.md (limited to 'notes/tasks') diff --git a/notes/tasks/2020-01-27_cleanup_cdx.md b/notes/tasks/2020-01-27_cleanup_cdx.md new file mode 100644 index 0000000..54db92e --- /dev/null +++ b/notes/tasks/2020-01-27_cleanup_cdx.md @@ -0,0 +1,34 @@ + +Accidentally seem to have backfilled many CDX lines with non-PDF content. +Should clear these out! + +Something like: + + mimetype = 'text/html' + not in file_meta + +Or maybe instead: + + mimetype = 'text/html' + not in file_meta + +SQL: + + SELECT * FROM cdx WHERE mimetype = 'text/html' AND row_created < '2019-10-01' LIMIT 5; + SELECT COUNT(1) FROM cdx WHERE mimetype = 'text/html' AND row_created < '2019-10-01'; + => 24841846 + + SELECT * FROM cdx LEFT JOIN file_meta ON file_meta.sha1hex = cdx.sha1hex WHERE cdx.mimetype = 'text/html' AND file_meta.sha256hex IS NULL LIMIT 5; + SELECT COUNT(1) FROM cdx LEFT JOIN file_meta ON cdx.sha1hex = file_meta.sha1hex WHERE cdx.mimetype = 'text/html' AND file_meta.sha256hex IS NULL; + => 24547552 + + DELETE FROM cdx + WHERE sha1hex IN + (SELECT cdx.sha1hex + FROM cdx + LEFT JOIN file_meta ON file_meta.sha1hex = cdx.sha1hex + WHERE cdx.mimetype = 'text/html' AND file_meta.sha256hex IS NULL); + => DELETE 24553428 + +Slightly more... probably should have had a "AND cdx.mimetype = 'text/html'" in +the DELETE WHERE clause. -- cgit v1.2.3