about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-03-10 22:39:03 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-03-10 22:39:03 -0700
commit: e7ba648fce4b8359358c6661b6ecb34576efc70d (patch)
tree: c21d3c7d1baf2da778ff591ce91c5c82576c5b47
parent: 19e094f820e7c619b9180616daf1586c4daa66bd (diff)
download: sandcrawler-e7ba648fce4b8359358c6661b6ecb34576efc70d.tar.gz
sandcrawler-e7ba648fce4b8359358c6661b6ecb34576efc70d.zip
ingest_file: --no-spn2 flag for single command
-rwxr-xr-x python/ingest_file.py 7
1 files changed, 6 insertions, 1 deletions
diff --git a/python/ingest_file.py b/python/ingest_file.py
index d4fdcac..f6f694e 100755
--- a/python/ingest_file.py
+++ b/python/ingest_file.py
@@ -17,7 +17,9 @@ def run_single_ingest(args):
)
if args.force_recrawl:
request['force_recrawl'] = True
- ingester = IngestFileWorker()
+ ingester = IngestFileWorker(
+ try_spn2=not args.no_spn2,
+ )
result = ingester.process(request)
print(json.dumps(result, sort_keys=True))
return result
@@ -51,6 +53,9 @@ def main():
sub_single.add_argument('--force-recrawl',
action='store_true',
help="ignore GWB history and use SPNv2 to re-crawl")
+ sub_single.add_argument('--no-spn2',
+ action='store_true',
+ help="don't use live web (SPNv2)")
sub_single.add_argument('--type',
default="pdf",
help="type of ingest (pdf, html, etc)")