diff options
Diffstat (limited to 'python/sandcrawler/ingest_file.py')
-rw-r--r-- | python/sandcrawler/ingest_file.py | 76 |
1 files changed, 33 insertions, 43 deletions
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py index c79973f..6e2876f 100644 --- a/python/sandcrawler/ingest_file.py +++ b/python/sandcrawler/ingest_file.py @@ -1,4 +1,5 @@ import json +import logging import sys import time import xml.etree.ElementTree @@ -286,12 +287,13 @@ class IngestFileWorker(SandcrawlerWorker): ): via = "spn2" resource = self.spn_client.crawl_resource(url, self.wayback_client) - print( - "[FETCH {:>6}] {} {}".format( - via, (resource and resource.status), (resource and resource.terminal_url) or url - ), - file=sys.stderr, - ) + if resource: + print( + f"find_resource {via=} {url=} {resource.status=} {resource.terminal_url=}", + file=sys.stderr, + ) + else: + print(f"find_resource {via=} {url=} status=", file=sys.stderr) return resource def process_existing(self, request: dict, result_row: dict) -> dict: @@ -510,7 +512,7 @@ class IngestFileWorker(SandcrawlerWorker): try: if self.html_quick_mode: - print(" WARN: running quick CDX-only fetches", file=sys.stderr) + logging.warn("running quick CDX-only fetches") full_resources = quick_fetch_html_resources( raw_resources, self.wayback_client.cdx_client, when ) @@ -549,7 +551,7 @@ class IngestFileWorker(SandcrawlerWorker): return info def timeout_response(self, task: dict) -> dict: - print("[TIMEOUT]", file=sys.stderr) + print("ingest-timeout", file=sys.stderr) return dict( request=task, hit=False, @@ -563,7 +565,25 @@ class IngestFileWorker(SandcrawlerWorker): return True def process(self, request: dict, key: Any = None) -> dict: - return self.process_file(request, key=key) + start_time = time.time() + result = self.process_file(request, key=key) + result["duration"] = time.time() - start_time + print(self.canonical_log_line(result), file=sys.stderr) + return result + + def canonical_log_line(self, result: dict) -> str: + request = result["request"] + line = f"ingest-result status={result['status']} ingest_type={request['ingest_type']}" + if result["status"] == "success": + if result.get("file_meta"): + line += f" sha1={result['file_meta']['sha1hex']}" + if result.get("grobid"): + line += f" grobid_status_code={result['grobid'].get('status_code')}" + if result.get("pdf_meta"): + line += f" pdf_status={result['pdf_meta'].get('status')}" + if result.get("duration"): + line += f" duration={result['duration']:.3}" + return line def process_file(self, request: dict, key: Any = None) -> dict: @@ -584,10 +604,9 @@ class IngestFileWorker(SandcrawlerWorker): for block in self.base_url_blocklist: if block in base_url: - print("[SKIP {:>6}] {}".format(ingest_type, base_url), file=sys.stderr) return dict(request=request, hit=False, status="skip-url-blocklist") - print("[INGEST {:>6}] {}".format(ingest_type, base_url), file=sys.stderr) + print(f"ingest-hop {ingest_type=} {base_url=}", file=sys.stderr) best_mimetype = None if ingest_type == "pdf": @@ -756,14 +775,8 @@ class IngestFileWorker(SandcrawlerWorker): next_url = fulltext_url.get("pdf_url") or fulltext_url.get("next_url") or "" assert next_url next_url = clean_url(next_url) - print( - "[PARSE {:>6}] {} {}".format( - ingest_type, - fulltext_url.get("technique"), - next_url, - ), - file=sys.stderr, - ) + technique = fulltext_url.get("technique") + print(f"parse-html {ingest_type=} {technique=} {next_url=}", file=sys.stderr) if next_url in hops: result["status"] = "link-loop" result["error_message"] = "repeated: {}".format(next_url) @@ -788,12 +801,7 @@ class IngestFileWorker(SandcrawlerWorker): next_url = next_url_found technique = "html_biblio" print( - "[PARSE {:>6}] {} {}".format( - ingest_type, - technique, - next_url, - ), - file=sys.stderr, + f"parse-html {ingest_type=} {technique=} {next_url=}", file=sys.stderr ) if next_url in hops: if ingest_type == "html": @@ -867,24 +875,6 @@ class IngestFileWorker(SandcrawlerWorker): result["status"] = "success" result["hit"] = True - if ingest_type == "pdf": - print( - "[SUCCESS {:>5}] sha1:{} grobid:{} pdfextract:{}".format( - ingest_type, - result.get("file_meta", {}).get("sha1hex"), - result.get("grobid", {}).get("status_code"), - result.get("pdf_meta", {}).get("status"), - ), - file=sys.stderr, - ) - else: - print( - "[SUCCESS {:>5}] sha1:{}".format( - ingest_type, - result.get("file_meta", {}).get("sha1hex"), - ), - file=sys.stderr, - ) return result |