diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-11-12 13:22:43 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2019-11-12 13:22:43 -0800 |
commit | 9529cbb2660897ce3ffe3986f60eafbf3596495d (patch) | |
tree | 2ef665564dacb6737c18bab65b5810527765f246 | |
parent | e0ad2e3be5286b2703df8f0a98b450658e28d28b (diff) | |
download | sandcrawler-9529cbb2660897ce3ffe3986f60eafbf3596495d.tar.gz sandcrawler-9529cbb2660897ce3ffe3986f60eafbf3596495d.zip |
add note to CDX backfill script that we should be filtering (oops)
-rwxr-xr-x | sql/backfill/backfill_cdx.py | 1 |
1 file changed, 1 insertion, 0 deletions
diff --git a/sql/backfill/backfill_cdx.py b/sql/backfill/backfill_cdx.py
index 1c452ca..f929502 100755
--- a/sql/backfill/backfill_cdx.py
+++ b/sql/backfill/backfill_cdx.py
@@ -109,6 +109,7 @@ def stdin_to_pg():
         info = parse_cdx_line(l)
         if not info:
             continue
+        # XXX: filter to, eg, PDF or octet/stream (derp)
         batch.append(info)
         counts['total'] += 1
         if len(batch) >= 1000: