aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2019-11-12 13:22:43 -0800
committer Bryan Newbold <bnewbold@archive.org> 2019-11-12 13:22:43 -0800
commit 9529cbb2660897ce3ffe3986f60eafbf3596495d (patch)
tree 2ef665564dacb6737c18bab65b5810527765f246
parent e0ad2e3be5286b2703df8f0a98b450658e28d28b (diff)
download sandcrawler-9529cbb2660897ce3ffe3986f60eafbf3596495d.tar.gz
sandcrawler-9529cbb2660897ce3ffe3986f60eafbf3596495d.zip
add note to CDX backfill script that we should be filtering (oops)
-rwxr-xr-x sql/backfill/backfill_cdx.py 1
1 file changed, 1 insertion, 0 deletions
diff --git a/sql/backfill/backfill_cdx.py b/sql/backfill/backfill_cdx.py
index 1c452ca..f929502 100755
--- a/sql/backfill/backfill_cdx.py
+++ b/sql/backfill/backfill_cdx.py
@@ -109,6 +109,7 @@ def stdin_to_pg():
info = parse_cdx_line(l)
if not info:
continue
+ # XXX: filter to, e.g., application/pdf or application/octet-stream (derp)
batch.append(info)
counts['total'] += 1
if len(batch) >= 1000: