diff options
Diffstat (limited to 'python/sandcrawler/ingest_html.py')
-rw-r--r-- | python/sandcrawler/ingest_html.py | 5 |
1 file changed, 2 insertions, 3 deletions
diff --git a/python/sandcrawler/ingest_html.py b/python/sandcrawler/ingest_html.py index 0ff7fe0..3989f9e 100644 --- a/python/sandcrawler/ingest_html.py +++ b/python/sandcrawler/ingest_html.py @@ -1,6 +1,7 @@ import argparse import datetime import json +import logging import sys import xml.etree.ElementTree as ET from typing import Any, List, Optional, Tuple @@ -156,9 +157,7 @@ def quick_fetch_html_resources( if not cdx_row: raise NoCaptureError(f"HTML sub-resource not found: {resource['url']}") if cdx_row.url != resource["url"] and not url_fuzzy_equal(cdx_row.url, resource["url"]): - print( - f" WARN: CDX fuzzy match: {cdx_row.url} != {resource['url']}", file=sys.stderr - ) + logging.warn(f"CDX fuzzy match expected={resource['url']} found={cdx_row.url}") if not cdx_row.status_code: # TODO: fall back to a full fetch? print(" WARN: skipping revisit record", file=sys.stderr) |