From b376a034f0b5d5996a7e0aaeb0cb473fa5a1c427 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Tue, 13 Jul 2021 12:34:10 -0700
Subject: crawl: small comment updates

---
 python/sandcrawler/ia.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'python/sandcrawler/ia.py')

diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 806f1e7..9b2635b 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -648,11 +648,12 @@ class WaybackClient:
                         status="success",
                         terminal_url=cdx_row.url,
                         terminal_dt=cdx_row.datetime,
-                        terminal_status_code=resource.revisit_cdx.status_code, # ?
+                        terminal_status_code=resource.revisit_cdx.status_code,
                         body=resource.body,
                         cdx=cdx_row,
                         revisit_cdx=resource.revisit_cdx,
                     )
+                # else, continue processing with revisit record
 
             if cdx_row.status_code in (200, 226):
                 revisit_cdx = None
@@ -927,9 +928,11 @@ class SavePageNowClient:
 
     def crawl_resource(self, start_url, wayback_client, force_simple_get=0):
         """
-        Runs a SPN2 crawl, then fetches body from wayback.
+        Runs a SPN2 crawl, then fetches body.
 
-        TODO: possible to fetch from petabox?
+        There is a delay between SPN2 crawls and WARC upload to petabox, so we
+        need to fetch the body via wayback replay instead of a petabox
+        range-request.
         """
 
         # HACK: capture CNKI domains with outlinks (for COVID-19 crawling)
-- 
cgit v1.2.3