From eabef14c79fb36e6076c215887b69630c482a729 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Sun, 8 Nov 2020 14:15:56 -0800
Subject: ingest tool: flag for HTML quick mode (CDX-only)

---
 python/ingest_file.py        | 4 ++++
 python/sandcrawler/ingest.py | 3 ++-
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'python')

diff --git a/python/ingest_file.py b/python/ingest_file.py
index 73e6a13..19938df 100755
--- a/python/ingest_file.py
+++ b/python/ingest_file.py
@@ -19,6 +19,7 @@ def run_single_ingest(args):
         request['force_recrawl'] = True
     ingester = IngestFileWorker(
         try_spn2=not args.no_spn2,
+        html_quick_mode=args.html_quick_mode,
     )
     result = ingester.process(request)
     print(json.dumps(result, sort_keys=True))
@@ -59,6 +60,9 @@ def main():
     sub_single.add_argument('--ingest-type',
         default="pdf",
         help="type of ingest (pdf, html, etc)")
+    sub_single.add_argument('--html-quick-mode',
+        action='store_true',
+        help="don't fetch individual sub-resources, just use CDX")
     sub_single.add_argument('url',
         help="URL of paper to fetch")
 
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index c9b3d2f..028f2b2 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -81,7 +81,7 @@ class IngestFileWorker(SandcrawlerWorker):
         self.try_existing_pdfextract = kwargs.get('try_existing_pdfextract', True)
         self.try_wayback = kwargs.get('try_wayback', True)
         self.try_spn2 = kwargs.get('try_spn2', True)
-        self.html_quick_mode = False
+        self.html_quick_mode = kwargs.get('html_quick_mode', False)
         self.adblock_rules = load_adblock_rules()
         self.max_html_resources = 200
 
@@ -374,6 +374,7 @@ class IngestFileWorker(SandcrawlerWorker):
 
         try:
             if self.html_quick_mode:
+                print("  WARN: running quick CDX-only fetches", file=sys.stderr)
                 full_resources = quick_fetch_html_resources(raw_resources, self.wayback_client.cdx_client, when)
             else:
                 full_resources = fetch_html_resources(raw_resources, self.wayback_client, when)
-- 
cgit v1.2.3