diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2019-11-12 13:22:43 -0800 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2019-11-12 13:22:43 -0800 | 
| commit | 9529cbb2660897ce3ffe3986f60eafbf3596495d (patch) | |
| tree | 2ef665564dacb6737c18bab65b5810527765f246 /sql | |
| parent | e0ad2e3be5286b2703df8f0a98b450658e28d28b (diff) | |
| download | sandcrawler-9529cbb2660897ce3ffe3986f60eafbf3596495d.tar.gz sandcrawler-9529cbb2660897ce3ffe3986f60eafbf3596495d.zip | |
add note to CDX backfill script that we should be filtering (oops)
Diffstat (limited to 'sql')
| -rwxr-xr-x | sql/backfill/backfill_cdx.py | 1 | 
1 file changed, 1 insertion, 0 deletions
```diff
diff --git a/sql/backfill/backfill_cdx.py b/sql/backfill/backfill_cdx.py
index 1c452ca..f929502 100755
--- a/sql/backfill/backfill_cdx.py
+++ b/sql/backfill/backfill_cdx.py
@@ -109,6 +109,7 @@ def stdin_to_pg():
         info = parse_cdx_line(l)
         if not info:
             continue
+        # XXX: filter to, eg, PDF or octet/stream (derp)
         batch.append(info)
         counts['total'] += 1
         if len(batch) >= 1000:
```
