about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- python/sandcrawler/ia.py 9
1 files changed, 6 insertions, 3 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 806f1e7..9b2635b 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -648,11 +648,12 @@ class WaybackClient:
status="success",
terminal_url=cdx_row.url,
terminal_dt=cdx_row.datetime,
- terminal_status_code=resource.revisit_cdx.status_code, # ?
+ terminal_status_code=resource.revisit_cdx.status_code,
body=resource.body,
cdx=cdx_row,
revisit_cdx=resource.revisit_cdx,
)
+ # else, continue processing with revisit record
if cdx_row.status_code in (200, 226):
revisit_cdx = None
@@ -927,9 +928,11 @@ class SavePageNowClient:
def crawl_resource(self, start_url, wayback_client, force_simple_get=0):
"""
- Runs a SPN2 crawl, then fetches body from wayback.
+ Runs a SPN2 crawl, then fetches body.
- TODO: possible to fetch from petabox?
+ There is a delay between SPN2 crawls and WARC upload to petabox, so we
+ need to fetch the body via wayback replay instead of petabox
+ range-request.
"""
# HACK: capture CNKI domains with outlinks (for COVID-19 crawling)