ingest: add force_recrawl flag to skip historical wayback lookup

author: Bryan Newbold <bnewbold@archive.org> 2020-03-02 16:37:08 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-03-02 16:37:08 -0800
commit: b45e1ac6638edb9d634269a343d05eff90daa31e (patch)
tree: 0c9e6bcedec7c782e2bbd54347a4c614077fd22f
parent: 6d41261ac417c61a61d0c794fa07639f454bcd52 (diff)
download: sandcrawler-b45e1ac6638edb9d634269a343d05eff90daa31e.tar.gz sandcrawler-b45e1ac6638edb9d634269a343d05eff90daa31e.zip
2 files changed, 6 insertions, 3 deletions
diff --git a/proposals/2019_ingest.md b/proposals/2019_ingest.md
index 196dbea..c649809 100644
--- a/proposals/2019_ingest.md
+++ b/proposals/2019_ingest.md
@@ -98,6 +98,7 @@ HTML? Or both? Let's just recrawl.
     `savepapernow-web`
   - `release_stage`: optional. indicates the release stage of fulltext expected to be found at this URL
   - `rel`: optional. indicates the link type
+  - `force_recrawl`: optional. if true, always re-crawls via SPNv2 (skips the wayback lookup)
   - `oa_status`: optional. unpaywall schema
   - `edit_extra`: additional metadata to be included in any eventual fatcat commits.
   - `fatcat`
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 8e0efeb..9a4335b 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -111,7 +111,7 @@ class IngestFileWorker(SandcrawlerWorker):
         else:
             return None
 
-    def find_resource(self, url, best_mimetype=None):
+    def find_resource(self, url, best_mimetype=None, force_recrawl=False):
         """
         Looks in wayback for a resource starting at the URL, following any
         redirects. If a hit isn't found, try crawling with SPN.
@@ -125,7 +125,7 @@ class IngestFileWorker(SandcrawlerWorker):
         if url.startswith("http://archive.org/") or url.startswith("https://archive.org/"):
             raise NotImplementedError("fetching from archive.org not implemented yet")
 
-        if self.try_wayback:
+        if self.try_wayback and not force_recrawl:
             via = "wayback"
             resource = self.wayback_client.lookup_resource(url, best_mimetype)
 
@@ -225,6 +225,8 @@ class IngestFileWorker(SandcrawlerWorker):
         ingest_type = request.get('ingest_type')
         base_url = request['base_url']
 
+        force_recrawl = bool(request.get('force_recrawl', False))
+
         for block in self.base_url_blocklist:
             if block in base_url:
                 print("[SKIP {}\t] {}".format(ingest_type, base_url), file=sys.stderr)
@@ -251,7 +253,7 @@ class IngestFileWorker(SandcrawlerWorker):
 
             result['hops'] = hops
             try:
-                resource = self.find_resource(next_url, best_mimetype)
+                resource = self.find_resource(next_url, best_mimetype, force_recrawl=force_recrawl)
             except SavePageNowError as e:
                 result['status'] = 'spn2-error'
                 result['error_message'] = str(e)[:1600]
author	Bryan Newbold <bnewbold@archive.org>	2020-03-02 16:37:08 -0800
committer	Bryan Newbold <bnewbold@archive.org>	2020-03-02 16:37:08 -0800
commit	b45e1ac6638edb9d634269a343d05eff90daa31e (patch)
tree	0c9e6bcedec7c782e2bbd54347a4c614077fd22f
parent	6d41261ac417c61a61d0c794fa07639f454bcd52 (diff)
download	sandcrawler-b45e1ac6638edb9d634269a343d05eff90daa31e.tar.gz sandcrawler-b45e1ac6638edb9d634269a343d05eff90daa31e.zip