diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2020-11-08 14:15:56 -0800 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2020-11-08 14:15:56 -0800 | 
| commit | eabef14c79fb36e6076c215887b69630c482a729 (patch) | |
| tree | 12f833e9ce0307eebdfac8427bfb052198879a15 /python/ingest_file.py | |
| parent | 00ed69dd00d07344d62c5adad4e9d15c721c3bb1 (diff) | |
| download | sandcrawler-eabef14c79fb36e6076c215887b69630c482a729.tar.gz sandcrawler-eabef14c79fb36e6076c215887b69630c482a729.zip | |
ingest tool: flag for HTML quick mode (CDX-only)
Diffstat (limited to 'python/ingest_file.py')
| -rwxr-xr-x | python/ingest_file.py | 4 | 
1 file changed, 4 insertions, 0 deletions
```diff
diff --git a/python/ingest_file.py b/python/ingest_file.py
index 73e6a13..19938df 100755
--- a/python/ingest_file.py
+++ b/python/ingest_file.py
@@ -19,6 +19,7 @@ def run_single_ingest(args):
         request['force_recrawl'] = True
     ingester = IngestFileWorker(
         try_spn2=not args.no_spn2,
+        html_quick_mode=args.html_quick_mode,
     )
     result = ingester.process(request)
     print(json.dumps(result, sort_keys=True))
@@ -59,6 +60,9 @@ def main():
     sub_single.add_argument('--ingest-type',
         default="pdf",
         help="type of ingest (pdf, html, etc)")
+    sub_single.add_argument('--html-quick-mode',
+        action='store_true',
+        help="don't fetch individual sub-resources, just use CDX")
     sub_single.add_argument('url',
         help="URL of paper to fetch")
```
