crawl: small comment updates

author: Bryan Newbold <bnewbold@archive.org> 2021-07-13 12:34:10 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-07-13 12:34:10 -0700
commit: b376a034f0b5d5996a7e0aaeb0cb473fa5a1c427 (patch)
tree: 808c2fd362770dbe85cb36681f798d4f09196d3b
parent: c468443d325a5e091162cfb5f85697679e87eb72 (diff)
download: sandcrawler-b376a034f0b5d5996a7e0aaeb0cb473fa5a1c427.tar.gz
sandcrawler-b376a034f0b5d5996a7e0aaeb0cb473fa5a1c427.zip
1 file changed, 6 insertions, 3 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 806f1e7..9b2635b 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -648,11 +648,12 @@ class WaybackClient:
                         status="success",
                         terminal_url=cdx_row.url,
                         terminal_dt=cdx_row.datetime,
-                        terminal_status_code=resource.revisit_cdx.status_code, # ?
+                        terminal_status_code=resource.revisit_cdx.status_code,
                         body=resource.body,
                         cdx=cdx_row,
                         revisit_cdx=resource.revisit_cdx,
                     )
+                # else, continue processing with revisit record
 
             if cdx_row.status_code in (200, 226):
                 revisit_cdx = None
@@ -927,9 +928,11 @@ class SavePageNowClient:
 
     def crawl_resource(self, start_url, wayback_client, force_simple_get=0):
         """
-        Runs a SPN2 crawl, then fetches body from wayback.
+        Runs a SPN2 crawl, then fetches body.
 
-        TODO: possible to fetch from petabox?
+        There is a delay between SPN2 crawls and WARC upload to petabox, so we
+        need to fetch the body via wayback replay instead of petabox
+        range-request.
         """
 
         # HACK: capture CNKI domains with outlinks (for COVID-19 crawling)
author	Bryan Newbold <bnewbold@archive.org>	2021-07-13 12:34:10 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2021-07-13 12:34:10 -0700
commit	b376a034f0b5d5996a7e0aaeb0cb473fa5a1c427 (patch)
tree	808c2fd362770dbe85cb36681f798d4f09196d3b
parent	c468443d325a5e091162cfb5f85697679e87eb72 (diff)
download	sandcrawler-b376a034f0b5d5996a7e0aaeb0cb473fa5a1c427.tar.gz sandcrawler-b376a034f0b5d5996a7e0aaeb0cb473fa5a1c427.zip