aboutsummaryrefslogtreecommitdiffstats
path: root/python/ingest_file.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-11-08 14:15:56 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-11-08 14:15:56 -0800
commit eabef14c79fb36e6076c215887b69630c482a729 (patch)
tree 12f833e9ce0307eebdfac8427bfb052198879a15 /python/ingest_file.py
parent 00ed69dd00d07344d62c5adad4e9d15c721c3bb1 (diff)
download sandcrawler-eabef14c79fb36e6076c215887b69630c482a729.tar.gz
sandcrawler-eabef14c79fb36e6076c215887b69630c482a729.zip
ingest tool: flag for HTML quick mode (CDX-only)
Diffstat (limited to 'python/ingest_file.py')
-rwxr-xr-x python/ingest_file.py 4
1 files changed, 4 insertions, 0 deletions
diff --git a/python/ingest_file.py b/python/ingest_file.py
index 73e6a13..19938df 100755
--- a/python/ingest_file.py
+++ b/python/ingest_file.py
@@ -19,6 +19,7 @@ def run_single_ingest(args):
request['force_recrawl'] = True
ingester = IngestFileWorker(
try_spn2=not args.no_spn2,
+ html_quick_mode=args.html_quick_mode,
)
result = ingester.process(request)
print(json.dumps(result, sort_keys=True))
@@ -59,6 +60,9 @@ def main():
sub_single.add_argument('--ingest-type',
default="pdf",
help="type of ingest (pdf, html, etc)")
+ sub_single.add_argument('--html-quick-mode',
+ action='store_true',
+ help="don't fetch individual sub-resources, just use CDX")
sub_single.add_argument('url',
help="URL of paper to fetch")