diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2020-03-02 16:37:08 -0800 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2020-03-02 16:37:08 -0800 | 
| commit | b45e1ac6638edb9d634269a343d05eff90daa31e (patch) | |
| tree | 0c9e6bcedec7c782e2bbd54347a4c614077fd22f | |
| parent | 6d41261ac417c61a61d0c794fa07639f454bcd52 (diff) | |
| download | sandcrawler-b45e1ac6638edb9d634269a343d05eff90daa31e.tar.gz sandcrawler-b45e1ac6638edb9d634269a343d05eff90daa31e.zip  | |
ingest: add force_recrawl flag to skip historical wayback lookup
| -rw-r--r-- | proposals/2019_ingest.md | 1 | ||||
| -rw-r--r-- | python/sandcrawler/ingest.py | 8 | 
2 files changed, 6 insertions, 3 deletions
diff --git a/proposals/2019_ingest.md b/proposals/2019_ingest.md index 196dbea..c649809 100644 --- a/proposals/2019_ingest.md +++ b/proposals/2019_ingest.md @@ -98,6 +98,7 @@ HTML? Or both? Let's just recrawl.      `savepapernow-web`    - `release_stage`: optional. indicates the release stage of fulltext expected to be found at this URL    - `rel`: optional. indicates the link type +  - `force_recrawl`: optional. if true, will always SPNv2 (won't check wayback)    - `oa_status`: optional. unpaywall schema    - `edit_extra`: additional metadata to be included in any eventual fatcat commits.    - `fatcat` diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index 8e0efeb..9a4335b 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -111,7 +111,7 @@ class IngestFileWorker(SandcrawlerWorker):          else:              return None -    def find_resource(self, url, best_mimetype=None): +    def find_resource(self, url, best_mimetype=None, force_recrawl=False):          """          Looks in wayback for a resource starting at the URL, following any          redirects. If a hit isn't found, try crawling with SPN. @@ -125,7 +125,7 @@ class IngestFileWorker(SandcrawlerWorker):          if url.startswith("http://archive.org/") or url.startswith("https://archive.org/"):              raise NotImplementedError("fetching from archive.org not implemented yet") -        if self.try_wayback: +        if self.try_wayback and not force_recrawl:              via = "wayback"              resource = self.wayback_client.lookup_resource(url, best_mimetype) @@ -225,6 +225,8 @@ class IngestFileWorker(SandcrawlerWorker):          ingest_type = request.get('ingest_type')          base_url = request['base_url'] +        force_recrawl = bool(request.get('force_recrawl', False)) +          for block in self.base_url_blocklist:              if block in base_url:                  print("[SKIP {}\t] {}".format(ingest_type, base_url), file=sys.stderr) @@ -251,7 +253,7 @@ class IngestFileWorker(SandcrawlerWorker):              result['hops'] = hops              try: -                resource = self.find_resource(next_url, best_mimetype) +                resource = self.find_resource(next_url, best_mimetype, force_recrawl=force_recrawl)              except SavePageNowError as e:                  result['status'] = 'spn2-error'                  result['error_message'] = str(e)[:1600]  | 
