aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/html_metadata.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2022-07-12 15:03:29 -0700
committer Bryan Newbold <bnewbold@archive.org> 2022-07-12 15:03:29 -0700
commit c15432c0ce52c48efabcd7e3221a5d625ef3e9d0 (patch)
tree 948d8e72a7235d07a5ea5d778e8ee388003418e6 /python/sandcrawler/html_metadata.py
parent 8f85ab294eae50e31efa9e31bb0bca1bca76cf8b (diff)
download sandcrawler-bnewbold-refactor-loggging.tar.gz
sandcrawler-bnewbold-refactor-loggging.zip
WIP: refactor logging calls in ingest pipelines [bnewbold-refactor-loggging]
Diffstat (limited to 'python/sandcrawler/html_metadata.py')
-rw-r--r-- python/sandcrawler/html_metadata.py 6
1 files changed, 3 insertions, 3 deletions
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 2fb500c..cee3b00 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -1,5 +1,5 @@
import datetime
-import sys
+import logging
import urllib.parse
from typing import Any, Dict, List, Optional, Tuple
@@ -726,7 +726,7 @@ def html_extract_fulltext_url(
continue
return (val, pattern.get("technique", "unknown"))
if self_doc_url:
- print(" WARN: returning fulltext URL pointing to self", file=sys.stderr)
+ logging.warn(f"returning fulltext URL pointing to self {self_doc_url=}")
return self_doc_url
return None
@@ -736,7 +736,7 @@ def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadat
meta: Any = dict()
head = doc.css_first("head")
if not head:
- print(f"WARN: empty <head>? {doc_url}", file=sys.stderr)
+ logging.warn(f"empty HTML head {doc_url=}")
return None
for field, patterns in HEAD_META_PATTERNS.items():