From b45e1ac6638edb9d634269a343d05eff90daa31e Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 2 Mar 2020 16:37:08 -0800 Subject: ingest: add force_recrawl flag to skip historical wayback lookup --- proposals/2019_ingest.md | 1 + python/sandcrawler/ingest.py | 8 +++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/proposals/2019_ingest.md b/proposals/2019_ingest.md index 196dbea..c649809 100644 --- a/proposals/2019_ingest.md +++ b/proposals/2019_ingest.md @@ -98,6 +98,7 @@ HTML? Or both? Let's just recrawl. `savepapernow-web` - `release_stage`: optional. indicates the release stage of fulltext expected to be found at this URL - `rel`: optional. indicates the link type + - `force_recrawl`: optional. if true, will always SPNv2 (won't check wayback) - `oa_status`: optional. unpaywall schema - `edit_extra`: additional metadata to be included in any eventual fatcat commits. - `fatcat` diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index 8e0efeb..9a4335b 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -111,7 +111,7 @@ class IngestFileWorker(SandcrawlerWorker): else: return None - def find_resource(self, url, best_mimetype=None): + def find_resource(self, url, best_mimetype=None, force_recrawl=False): """ Looks in wayback for a resource starting at the URL, following any redirects. If a hit isn't found, try crawling with SPN. @@ -125,7 +125,7 @@ class IngestFileWorker(SandcrawlerWorker): if url.startswith("http://archive.org/") or url.startswith("https://archive.org/"): raise NotImplementedError("fetching from archive.org not implemented yet") - if self.try_wayback: + if self.try_wayback and not force_recrawl: via = "wayback" resource = self.wayback_client.lookup_resource(url, best_mimetype) @@ -225,6 +225,8 @@ class IngestFileWorker(SandcrawlerWorker): ingest_type = request.get('ingest_type') base_url = request['base_url'] + force_recrawl = bool(request.get('force_recrawl', False)) + for block in self.base_url_blocklist: if block in base_url: print("[SKIP {}\t] {}".format(ingest_type, base_url), file=sys.stderr) @@ -251,7 +253,7 @@ class IngestFileWorker(SandcrawlerWorker): result['hops'] = hops try: - resource = self.find_resource(next_url, best_mimetype) + resource = self.find_resource(next_url, best_mimetype, force_recrawl=force_recrawl) except SavePageNowError as e: result['status'] = 'spn2-error' result['error_message'] = str(e)[:1600] -- cgit v1.2.3