diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-03-10 22:39:03 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-03-10 22:39:03 -0700 |
commit | e7ba648fce4b8359358c6661b6ecb34576efc70d (patch) | |
tree | c21d3c7d1baf2da778ff591ce91c5c82576c5b47 | |
parent | 19e094f820e7c619b9180616daf1586c4daa66bd (diff) | |
download | sandcrawler-e7ba648fce4b8359358c6661b6ecb34576efc70d.tar.gz sandcrawler-e7ba648fce4b8359358c6661b6ecb34576efc70d.zip |
ingest_file: --no-spn2 flag for single command
-rwxr-xr-x | python/ingest_file.py | 7 |
1 files changed, 6 insertions, 1 deletions
```diff
diff --git a/python/ingest_file.py b/python/ingest_file.py
index d4fdcac..f6f694e 100755
--- a/python/ingest_file.py
+++ b/python/ingest_file.py
@@ -17,7 +17,9 @@ def run_single_ingest(args):
     )
     if args.force_recrawl:
         request['force_recrawl'] = True
-    ingester = IngestFileWorker()
+    ingester = IngestFileWorker(
+        try_spn2=not args.no_spn2,
+    )
     result = ingester.process(request)
     print(json.dumps(result, sort_keys=True))
     return result
@@ -51,6 +53,9 @@ def main():
     sub_single.add_argument('--force-recrawl',
         action='store_true',
         help="ignore GWB history and use SPNv2 to re-crawl")
+    sub_single.add_argument('--no-spn2',
+        action='store_true',
+        help="don't use live web (SPNv2)")
     sub_single.add_argument('--type',
         default="pdf",
         help="type of ingest (pdf, html, etc)")
```