diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-11-08 14:15:56 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-11-08 14:15:56 -0800 |
commit | eabef14c79fb36e6076c215887b69630c482a729 (patch) | |
tree | 12f833e9ce0307eebdfac8427bfb052198879a15 /python/ingest_file.py | |
parent | 00ed69dd00d07344d62c5adad4e9d15c721c3bb1 (diff) | |
download | sandcrawler-eabef14c79fb36e6076c215887b69630c482a729.tar.gz sandcrawler-eabef14c79fb36e6076c215887b69630c482a729.zip |
ingest tool: flag for HTML quick mode (CDX-only)
Diffstat (limited to 'python/ingest_file.py')
-rwxr-xr-x | python/ingest_file.py | 4 |
1 files changed, 4 insertions, 0 deletions
```diff
diff --git a/python/ingest_file.py b/python/ingest_file.py
index 73e6a13..19938df 100755
--- a/python/ingest_file.py
+++ b/python/ingest_file.py
@@ -19,6 +19,7 @@ def run_single_ingest(args):
         request['force_recrawl'] = True
     ingester = IngestFileWorker(
         try_spn2=not args.no_spn2,
+        html_quick_mode=args.html_quick_mode,
     )
     result = ingester.process(request)
     print(json.dumps(result, sort_keys=True))
@@ -59,6 +60,9 @@ def main():
     sub_single.add_argument('--ingest-type',
         default="pdf",
         help="type of ingest (pdf, html, etc)")
+    sub_single.add_argument('--html-quick-mode',
+        action='store_true',
+        help="don't fetch individual sub-resources, just use CDX")
     sub_single.add_argument('url',
         help="URL of paper to fetch")
 
```