diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-11-08 14:15:56 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-11-08 14:15:56 -0800 |
commit | eabef14c79fb36e6076c215887b69630c482a729 (patch) | |
tree | 12f833e9ce0307eebdfac8427bfb052198879a15 /python/sandcrawler | |
parent | 00ed69dd00d07344d62c5adad4e9d15c721c3bb1 (diff) | |
download | sandcrawler-eabef14c79fb36e6076c215887b69630c482a729.tar.gz sandcrawler-eabef14c79fb36e6076c215887b69630c482a729.zip |
ingest tool: flag for HTML quick mode (CDX-only)
Diffstat (limited to 'python/sandcrawler')
-rw-r--r-- | python/sandcrawler/ingest.py | 3 |
1 file changed, 2 insertions, 1 deletion
```diff
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index c9b3d2f..028f2b2 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -81,7 +81,7 @@ class IngestFileWorker(SandcrawlerWorker):
         self.try_existing_pdfextract = kwargs.get('try_existing_pdfextract', True)
         self.try_wayback = kwargs.get('try_wayback', True)
         self.try_spn2 = kwargs.get('try_spn2', True)
-        self.html_quick_mode = False
+        self.html_quick_mode = kwargs.get('html_quick_mode', False)
         self.adblock_rules = load_adblock_rules()
         self.max_html_resources = 200
 
@@ -374,6 +374,7 @@ class IngestFileWorker(SandcrawlerWorker):
 
         try:
             if self.html_quick_mode:
+                print(" WARN: running quick CDX-only fetches", file=sys.stderr)
                 full_resources = quick_fetch_html_resources(raw_resources, self.wayback_client.cdx_client, when)
             else:
                 full_resources = fetch_html_resources(raw_resources, self.wayback_client, when)
```