aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/ingest_html.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2022-07-12 15:03:29 -0700
committer Bryan Newbold <bnewbold@archive.org> 2022-07-12 15:03:29 -0700
commit c15432c0ce52c48efabcd7e3221a5d625ef3e9d0 (patch)
tree 948d8e72a7235d07a5ea5d778e8ee388003418e6 /python/sandcrawler/ingest_html.py
parent 8f85ab294eae50e31efa9e31bb0bca1bca76cf8b (diff)
download sandcrawler-bnewbold-refactor-loggging.tar.gz
sandcrawler-bnewbold-refactor-loggging.zip
WIP: refactor logging calls in ingest pipelines (branch: bnewbold-refactor-loggging)
Diffstat (limited to 'python/sandcrawler/ingest_html.py')
-rw-r--r-- python/sandcrawler/ingest_html.py 5
1 files changed, 2 insertions, 3 deletions
diff --git a/python/sandcrawler/ingest_html.py b/python/sandcrawler/ingest_html.py
index 0ff7fe0..3989f9e 100644
--- a/python/sandcrawler/ingest_html.py
+++ b/python/sandcrawler/ingest_html.py
@@ -1,6 +1,7 @@
import argparse
import datetime
import json
+import logging
import sys
import xml.etree.ElementTree as ET
from typing import Any, List, Optional, Tuple
@@ -156,9 +157,7 @@ def quick_fetch_html_resources(
if not cdx_row:
raise NoCaptureError(f"HTML sub-resource not found: {resource['url']}")
if cdx_row.url != resource["url"] and not url_fuzzy_equal(cdx_row.url, resource["url"]):
- print(
- f" WARN: CDX fuzzy match: {cdx_row.url} != {resource['url']}", file=sys.stderr
- )
+ logging.warn(f"CDX fuzzy match expected={resource['url']} found={cdx_row.url}")
if not cdx_row.status_code:
# TODO: fall back to a full fetch?
print(" WARN: skipping revisit record", file=sys.stderr)