about | summary | refs | log | tree | commit | diff | stats
path: root/python
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-03-02 16:37:08 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-03-02 16:37:08 -0800
commit: b45e1ac6638edb9d634269a343d05eff90daa31e (patch)
tree: 0c9e6bcedec7c782e2bbd54347a4c614077fd22f /python
parent: 6d41261ac417c61a61d0c794fa07639f454bcd52 (diff)
download: sandcrawler-b45e1ac6638edb9d634269a343d05eff90daa31e.tar.gz
download: sandcrawler-b45e1ac6638edb9d634269a343d05eff90daa31e.zip
ingest: add force_recrawl flag to skip historical wayback lookup
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/ingest.py 8
1 file changed, 5 insertions, 3 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 8e0efeb..9a4335b 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -111,7 +111,7 @@ class IngestFileWorker(SandcrawlerWorker):
else:
return None
- def find_resource(self, url, best_mimetype=None):
+ def find_resource(self, url, best_mimetype=None, force_recrawl=False):
"""
Looks in wayback for a resource starting at the URL, following any
redirects. If a hit isn't found, try crawling with SPN.
@@ -125,7 +125,7 @@ class IngestFileWorker(SandcrawlerWorker):
if url.startswith("http://archive.org/") or url.startswith("https://archive.org/"):
raise NotImplementedError("fetching from archive.org not implemented yet")
- if self.try_wayback:
+ if self.try_wayback and not force_recrawl:
via = "wayback"
resource = self.wayback_client.lookup_resource(url, best_mimetype)
@@ -225,6 +225,8 @@ class IngestFileWorker(SandcrawlerWorker):
ingest_type = request.get('ingest_type')
base_url = request['base_url']
+ force_recrawl = bool(request.get('force_recrawl', False))
+
for block in self.base_url_blocklist:
if block in base_url:
print("[SKIP {}\t] {}".format(ingest_type, base_url), file=sys.stderr)
@@ -251,7 +253,7 @@ class IngestFileWorker(SandcrawlerWorker):
result['hops'] = hops
try:
- resource = self.find_resource(next_url, best_mimetype)
+ resource = self.find_resource(next_url, best_mimetype, force_recrawl=force_recrawl)
except SavePageNowError as e:
result['status'] = 'spn2-error'
result['error_message'] = str(e)[:1600]