aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-06-25 16:33:20 -0700
committerBryan Newbold <bnewbold@archive.org>2020-06-25 16:33:20 -0700
commit8fe71d3395e6d4d0aa2850945dda73bd82d57bed (patch)
tree1acedac595aef6174a6174019dde1359b33d63a2
parentb15354bd8e1bb44aa8f774f2d73db8742826c4f6 (diff)
downloadsandcrawler-8fe71d3395e6d4d0aa2850945dda73bd82d57bed.tar.gz
sandcrawler-8fe71d3395e6d4d0aa2850945dda73bd82d57bed.zip
commit old notes on a one-off CDX table cleanup
-rw-r--r--notes/tasks/2020-01-27_cleanup_cdx.md34
1 files changed, 34 insertions, 0 deletions
diff --git a/notes/tasks/2020-01-27_cleanup_cdx.md b/notes/tasks/2020-01-27_cleanup_cdx.md
new file mode 100644
index 0000000..54db92e
--- /dev/null
+++ b/notes/tasks/2020-01-27_cleanup_cdx.md
@@ -0,0 +1,34 @@
+
+Accidentally seem to have backfilled many CDX lines with non-PDF content.
+Should clear these out!
+
+Something like:
+
+ mimetype = 'text/html'
+ not in file_meta
+
+Or maybe instead:
+
+ mimetype = 'text/html'
+ not in file_meta
+
+SQL:
+
+ SELECT * FROM cdx WHERE mimetype = 'text/html' AND row_created < '2019-10-01' LIMIT 5;
+ SELECT COUNT(1) FROM cdx WHERE mimetype = 'text/html' AND row_created < '2019-10-01';
+ => 24841846
+
+ SELECT * FROM cdx LEFT JOIN file_meta ON file_meta.sha1hex = cdx.sha1hex WHERE cdx.mimetype = 'text/html' AND file_meta.sha256hex IS NULL LIMIT 5;
+ SELECT COUNT(1) FROM cdx LEFT JOIN file_meta ON cdx.sha1hex = file_meta.sha1hex WHERE cdx.mimetype = 'text/html' AND file_meta.sha256hex IS NULL;
+ => 24547552
+
+ DELETE FROM cdx
+ WHERE sha1hex IN
+ (SELECT cdx.sha1hex
+ FROM cdx
+ LEFT JOIN file_meta ON file_meta.sha1hex = cdx.sha1hex
+ WHERE cdx.mimetype = 'text/html' AND file_meta.sha256hex IS NULL);
+ => DELETE 24553428
+
+Slightly more... probably should have had a "AND cdx.mimetype = 'text/html'" in
+the DELETE WHERE clause.