diff options
Diffstat (limited to 'notes')
-rw-r--r-- | notes/tasks/2020-01-27_cleanup_cdx.md | 34 |
1 files changed, 34 insertions, 0 deletions
diff --git a/notes/tasks/2020-01-27_cleanup_cdx.md b/notes/tasks/2020-01-27_cleanup_cdx.md new file mode 100644 index 0000000..54db92e --- /dev/null +++ b/notes/tasks/2020-01-27_cleanup_cdx.md @@ -0,0 +1,34 @@ + +Accidentally seem to have backfilled many CDX lines with non-PDF content. +Should clear these out! + +Something like: + + mimetype = 'text/html' + not in file_meta + +Or maybe instead: + + mimetype = 'text/html' + not in file_meta + +SQL: + + SELECT * FROM cdx WHERE mimetype = 'text/html' AND row_created < '2019-10-01' LIMIT 5; + SELECT COUNT(1) FROM cdx WHERE mimetype = 'text/html' AND row_created < '2019-10-01'; + => 24841846 + + SELECT * FROM cdx LEFT JOIN file_meta ON file_meta.sha1hex = cdx.sha1hex WHERE cdx.mimetype = 'text/html' AND file_meta.sha256hex IS NULL LIMIT 5; + SELECT COUNT(1) FROM cdx LEFT JOIN file_meta ON cdx.sha1hex = file_meta.sha1hex WHERE cdx.mimetype = 'text/html' AND file_meta.sha256hex IS NULL; + => 24547552 + + DELETE FROM cdx + WHERE sha1hex IN + (SELECT cdx.sha1hex + FROM cdx + LEFT JOIN file_meta ON file_meta.sha1hex = cdx.sha1hex + WHERE cdx.mimetype = 'text/html' AND file_meta.sha256hex IS NULL); + => DELETE 24553428 + +Slightly more... probably should have had a "AND cdx.mimetype = 'text/html'" in +the DELETE WHERE clause. |