about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-11-08 17:12:18 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-11-08 17:12:18 -0800
commit e016a05a62a61a277218b4ddb8378a310e241207 (patch)
tree d80996558180636e2321fd24171ab8ad24273f75
parent 227e182c6b65d75d48e022f8bb56d1326854d00e (diff)
download sandcrawler-e016a05a62a61a277218b4ddb8378a310e241207.tar.gz
sandcrawler-e016a05a62a61a277218b4ddb8378a310e241207.zip
ingest tool: more ingest control args
-rwxr-xr-x python/ingest_file.py 11
1 files changed, 10 insertions, 1 deletions
diff --git a/python/ingest_file.py b/python/ingest_file.py
index 19938df..20b6d67 100755
--- a/python/ingest_file.py
+++ b/python/ingest_file.py
@@ -27,7 +27,10 @@ def run_single_ingest(args):
def run_requests(args):
# TODO: switch to using JsonLinePusher
- ingester = IngestFileWorker()
+ ingester = IngestFileWorker(
+ try_spn2=not args.no_spn2,
+ html_quick_mode=args.html_quick_mode,
+ )
for l in args.json_file:
request = json.loads(l.strip())
result = ingester.process(request)
@@ -68,6 +71,12 @@ def main():
sub_requests = subparsers.add_parser('requests',
help="takes a series of ingest requests (JSON, per line) and runs each")
+ sub_requests.add_argument('--no-spn2',
+ action='store_true',
+ help="don't use live web (SPNv2)")
+ sub_requests.add_argument('--html-quick-mode',
+ action='store_true',
+ help="don't fetch individual sub-resources, just use CDX")
sub_requests.set_defaults(func=run_requests)
sub_requests.add_argument('json_file',
help="JSON file (request per line) to import from (or stdin)",