diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-11-08 17:12:18 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-11-08 17:12:18 -0800 |
commit | e016a05a62a61a277218b4ddb8378a310e241207 (patch) | |
tree | d80996558180636e2321fd24171ab8ad24273f75 | |
parent | 227e182c6b65d75d48e022f8bb56d1326854d00e (diff) | |
download | sandcrawler-e016a05a62a61a277218b4ddb8378a310e241207.tar.gz sandcrawler-e016a05a62a61a277218b4ddb8378a310e241207.zip |
ingest tool: more ingest control args
-rwxr-xr-x | python/ingest_file.py | 11 |
1 file changed, 10 insertions, 1 deletion
```diff
diff --git a/python/ingest_file.py b/python/ingest_file.py
index 19938df..20b6d67 100755
--- a/python/ingest_file.py
+++ b/python/ingest_file.py
@@ -27,7 +27,10 @@ def run_single_ingest(args):
 
 def run_requests(args):
     # TODO: switch to using JsonLinePusher
-    ingester = IngestFileWorker()
+    ingester = IngestFileWorker(
+        try_spn2=not args.no_spn2,
+        html_quick_mode=args.html_quick_mode,
+    )
     for l in args.json_file:
         request = json.loads(l.strip())
         result = ingester.process(request)
@@ -68,6 +71,12 @@ def main():
 
     sub_requests = subparsers.add_parser('requests',
         help="takes a series of ingest requests (JSON, per line) and runs each")
+    sub_requests.add_argument('--no-spn2',
+        action='store_true',
+        help="don't use live web (SPNv2)")
+    sub_requests.add_argument('--html-quick-mode',
+        action='store_true',
+        help="don't fetch individual sub-resources, just use CDX")
     sub_requests.set_defaults(func=run_requests)
     sub_requests.add_argument('json_file',
         help="JSON file (request per line) to import from (or stdin)",
```