From b45e1ac6638edb9d634269a343d05eff90daa31e Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Mon, 2 Mar 2020 16:37:08 -0800
Subject: ingest: add force_recrawl flag to skip historical wayback lookup

---
 python/sandcrawler/ingest.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'python')

diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 8e0efeb..9a4335b 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -111,7 +111,7 @@ class IngestFileWorker(SandcrawlerWorker):
         else:
             return None
 
-    def find_resource(self, url, best_mimetype=None):
+    def find_resource(self, url, best_mimetype=None, force_recrawl=False):
         """
         Looks in wayback for a resource starting at the URL, following any
         redirects. If a hit isn't found, try crawling with SPN.
@@ -125,7 +125,7 @@ class IngestFileWorker(SandcrawlerWorker):
         if url.startswith("http://archive.org/") or url.startswith("https://archive.org/"):
             raise NotImplementedError("fetching from archive.org not implemented yet")
 
-        if self.try_wayback:
+        if self.try_wayback and not force_recrawl:
             via = "wayback"
             resource = self.wayback_client.lookup_resource(url, best_mimetype)
 
@@ -225,6 +225,8 @@ class IngestFileWorker(SandcrawlerWorker):
         ingest_type = request.get('ingest_type')
         base_url = request['base_url']
 
+        force_recrawl = bool(request.get('force_recrawl', False))
+
         for block in self.base_url_blocklist:
             if block in base_url:
                 print("[SKIP {}\t] {}".format(ingest_type, base_url), file=sys.stderr)
@@ -251,7 +253,7 @@ class IngestFileWorker(SandcrawlerWorker):
             result['hops'] = hops
 
             try:
-                resource = self.find_resource(next_url, best_mimetype)
+                resource = self.find_resource(next_url, best_mimetype, force_recrawl=force_recrawl)
             except SavePageNowError as e:
                 result['status'] = 'spn2-error'
                 result['error_message'] = str(e)[:1600]
-- 
cgit v1.2.3