aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/html_ingest.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/sandcrawler/html_ingest.py')
-rw-r--r--python/sandcrawler/html_ingest.py4
1 files changed, 4 insertions, 0 deletions
diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py
index 42bd946..a8ba0d6 100644
--- a/python/sandcrawler/html_ingest.py
+++ b/python/sandcrawler/html_ingest.py
@@ -127,6 +127,10 @@ def quick_fetch_html_resources(resources: List[dict], cdx_client: CdxApiClient,
raise NoCaptureError(f"HTML sub-resource not found: {resource['url']}")
if cdx_row.url != resource['url']:
print(f" WARN: CDX fuzzy match: {cdx_row.url} != {resource['url']}", file=sys.stderr)
+ if not cdx_row.status_code:
+ # TODO: fall back to a full fetch?
+ print(f" WARN: skipping revisit record", file=sys.stderr)
+ continue
full.append(WebResource(
surt=cdx_row.surt,
timestamp=cdx_row.datetime,