diff options
author | Bryan Newbold <bnewbold@archive.org> | 2022-07-12 15:03:29 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2022-07-12 15:03:29 -0700 |
commit | c15432c0ce52c48efabcd7e3221a5d625ef3e9d0 (patch) | |
tree | 948d8e72a7235d07a5ea5d778e8ee388003418e6 /python/sandcrawler/ingest_html.py | |
parent | 8f85ab294eae50e31efa9e31bb0bca1bca76cf8b (diff) | |
download | sandcrawler-bnewbold-refactor-loggging.tar.gz sandcrawler-bnewbold-refactor-loggging.zip |
WIP: refactor logging calls in ingest pipelines (branch: bnewbold-refactor-loggging)
Diffstat (limited to 'python/sandcrawler/ingest_html.py')
-rw-r--r-- | python/sandcrawler/ingest_html.py | 5 |
1 files changed, 2 insertions, 3 deletions
```diff
diff --git a/python/sandcrawler/ingest_html.py b/python/sandcrawler/ingest_html.py
index 0ff7fe0..3989f9e 100644
--- a/python/sandcrawler/ingest_html.py
+++ b/python/sandcrawler/ingest_html.py
@@ -1,6 +1,7 @@
 import argparse
 import datetime
 import json
+import logging
 import sys
 import xml.etree.ElementTree as ET
 from typing import Any, List, Optional, Tuple
@@ -156,9 +157,7 @@ def quick_fetch_html_resources(
         if not cdx_row:
             raise NoCaptureError(f"HTML sub-resource not found: {resource['url']}")
         if cdx_row.url != resource["url"] and not url_fuzzy_equal(cdx_row.url, resource["url"]):
-            print(
-                f" WARN: CDX fuzzy match: {cdx_row.url} != {resource['url']}", file=sys.stderr
-            )
+            logging.warn(f"CDX fuzzy match expected={resource['url']} found={cdx_row.url}")
         if not cdx_row.status_code:
             # TODO: fall back to a full fetch?
             print(" WARN: skipping revisit record", file=sys.stderr)
```