about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2021-07-13 12:34:10 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-07-13 12:34:10 -0700
commit: b376a034f0b5d5996a7e0aaeb0cb473fa5a1c427 (patch)
tree: 808c2fd362770dbe85cb36681f798d4f09196d3b
parent: c468443d325a5e091162cfb5f85697679e87eb72 (diff)
download: sandcrawler-b376a034f0b5d5996a7e0aaeb0cb473fa5a1c427.tar.gz
sandcrawler-b376a034f0b5d5996a7e0aaeb0cb473fa5a1c427.zip
crawl: small comment updates
-rw-r--r-- python/sandcrawler/ia.py | 9
1 files changed, 6 insertions, 3 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 806f1e7..9b2635b 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -648,11 +648,12 @@ class WaybackClient:
status="success",
terminal_url=cdx_row.url,
terminal_dt=cdx_row.datetime,
- terminal_status_code=resource.revisit_cdx.status_code, # ?
+ terminal_status_code=resource.revisit_cdx.status_code,
body=resource.body,
cdx=cdx_row,
revisit_cdx=resource.revisit_cdx,
)
+ # else, continue processing with revisit record
if cdx_row.status_code in (200, 226):
revisit_cdx = None
@@ -927,9 +928,11 @@ class SavePageNowClient:
def crawl_resource(self, start_url, wayback_client, force_simple_get=0):
"""
- Runs a SPN2 crawl, then fetches body from wayback.
+ Runs a SPN2 crawl, then fetches body.
- TODO: possible to fetch from petabox?
+ There is a delay between SPN2 crawls and WARC upload to petabox, so we
+ need to fetch the body via wayback replay instead of petabox
+ range-request.
"""
# HACK: capture CNKI domains with outlinks (for COVID-19 crawling)