diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-07-13 12:34:10 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-07-13 12:34:10 -0700 |
commit | b376a034f0b5d5996a7e0aaeb0cb473fa5a1c427 (patch) | |
tree | 808c2fd362770dbe85cb36681f798d4f09196d3b | |
parent | c468443d325a5e091162cfb5f85697679e87eb72 (diff) | |
download | sandcrawler-b376a034f0b5d5996a7e0aaeb0cb473fa5a1c427.tar.gz sandcrawler-b376a034f0b5d5996a7e0aaeb0cb473fa5a1c427.zip |
crawl: small comment updates
-rw-r--r-- | python/sandcrawler/ia.py | 9 |
1 files changed, 6 insertions, 3 deletions
```diff
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 806f1e7..9b2635b 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -648,11 +648,12 @@ class WaybackClient:
                         status="success",
                         terminal_url=cdx_row.url,
                         terminal_dt=cdx_row.datetime,
-                        terminal_status_code=resource.revisit_cdx.status_code, # ?
+                        terminal_status_code=resource.revisit_cdx.status_code,
                         body=resource.body,
                         cdx=cdx_row,
                         revisit_cdx=resource.revisit_cdx,
                     )
+                # else, continue processing with revisit record
 
             if cdx_row.status_code in (200, 226):
                 revisit_cdx = None
@@ -927,9 +928,11 @@ class SavePageNowClient:
 
     def crawl_resource(self, start_url, wayback_client, force_simple_get=0):
         """
-        Runs a SPN2 crawl, then fetches body from wayback.
+        Runs a SPN2 crawl, then fetches body.
 
-        TODO: possible to fetch from petabox?
+        There is a delay between SPN2 crawls and WARC upload to petabox, so we
+        need to fetch the body via wayback replay instead of petabox
+        range-request.
         """
 
         # HACK: capture CNKI domains with outlinks (for COVID-19 crawling)
```