about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-03-02 16:37:08 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-03-02 16:37:08 -0800
commit b45e1ac6638edb9d634269a343d05eff90daa31e (patch)
tree 0c9e6bcedec7c782e2bbd54347a4c614077fd22f
parent 6d41261ac417c61a61d0c794fa07639f454bcd52 (diff)
download sandcrawler-b45e1ac6638edb9d634269a343d05eff90daa31e.tar.gz
sandcrawler-b45e1ac6638edb9d634269a343d05eff90daa31e.zip
ingest: add force_recrawl flag to skip historical wayback lookup
-rw-r--r-- proposals/2019_ingest.md 1
-rw-r--r-- python/sandcrawler/ingest.py 8
2 files changed, 6 insertions, 3 deletions
diff --git a/proposals/2019_ingest.md b/proposals/2019_ingest.md
index 196dbea..c649809 100644
--- a/proposals/2019_ingest.md
+++ b/proposals/2019_ingest.md
@@ -98,6 +98,7 @@ HTML? Or both? Let's just recrawl.
`savepapernow-web`
- `release_stage`: optional. indicates the release stage of fulltext expected to be found at this URL
- `rel`: optional. indicates the link type
+ - `force_recrawl`: optional. if true, will always crawl with SPNv2 (won't check wayback first)
- `oa_status`: optional. unpaywall schema
- `edit_extra`: additional metadata to be included in any eventual fatcat commits.
- `fatcat`
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 8e0efeb..9a4335b 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -111,7 +111,7 @@ class IngestFileWorker(SandcrawlerWorker):
else:
return None
- def find_resource(self, url, best_mimetype=None):
+ def find_resource(self, url, best_mimetype=None, force_recrawl=False):
"""
Looks in wayback for a resource starting at the URL, following any
redirects. If a hit isn't found, try crawling with SPN.
@@ -125,7 +125,7 @@ class IngestFileWorker(SandcrawlerWorker):
if url.startswith("http://archive.org/") or url.startswith("https://archive.org/"):
raise NotImplementedError("fetching from archive.org not implemented yet")
- if self.try_wayback:
+ if self.try_wayback and not force_recrawl:
via = "wayback"
resource = self.wayback_client.lookup_resource(url, best_mimetype)
@@ -225,6 +225,8 @@ class IngestFileWorker(SandcrawlerWorker):
ingest_type = request.get('ingest_type')
base_url = request['base_url']
+ force_recrawl = bool(request.get('force_recrawl', False))
+
for block in self.base_url_blocklist:
if block in base_url:
print("[SKIP {}\t] {}".format(ingest_type, base_url), file=sys.stderr)
@@ -251,7 +253,7 @@ class IngestFileWorker(SandcrawlerWorker):
result['hops'] = hops
try:
- resource = self.find_resource(next_url, best_mimetype)
+ resource = self.find_resource(next_url, best_mimetype, force_recrawl=force_recrawl)
except SavePageNowError as e:
result['status'] = 'spn2-error'
result['error_message'] = str(e)[:1600]