From 336d6000d5007e010476df7a767f6a1361ae2229 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Thu, 29 Oct 2020 14:31:36 -0700
Subject: html: initial ingest implementation
---
 python/sandcrawler/html_ingest.py | 193 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 193 insertions(+)
 create mode 100644 python/sandcrawler/html_ingest.py

diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py
new file mode 100644
index 0000000..0e1439c
--- /dev/null
+++ b/python/sandcrawler/html_ingest.py
@@ -0,0 +1,193 @@

import sys
import json
import datetime
import argparse
from typing import List, Optional, Any

import trafilatura
import pydantic
from selectolax.parser import HTMLParser

from sandcrawler.ia import WaybackClient, CdxApiClient, ResourceResult
from sandcrawler.misc import gen_file_metadata
from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules


def html_extract_fulltext_teixml(doc: bytes) -> dict:
    tei_xml = trafilatura.extract(doc,
        tei_output=True,
        include_comments=False,
        include_formatting=True,
    )
    if tei_xml:
        return dict(status="success", tei_xml=tei_xml)
    else:
        return dict(status="empty-xml")


class WebResource(pydantic.BaseModel):
    surt: str
    timestamp: datetime.datetime
    url: str
    sha1hex: str
    mimetype: str
    status_code: int
    size: Optional[int]
    sha256hex: Optional[str]
    resource_type: Optional[str]


def quick_fetch_html_resources(resources: List[dict], cdx_client: CdxApiClient, when: Optional[datetime.datetime]) -> List[WebResource]:
    """
    This is the lazy version, which just does a CDX lookup for each resource.

    Takes a list instead of a single record because we may want to circuit
    break on failure, and may introduce concurrency internal to this function.
    """

    full = []
    for resource in resources:
        cdx_row = cdx_client.lookup_best(resource['url'])
        if not cdx_row:
            raise Exception("CDX lookup failed")
        if cdx_row.url != resource['url']:
            pass
            #raise Exception(
            #    f"CDX lookup URL mismatch: {cdx_row.url} != {resource['url']}")
        full.append(WebResource(
            surt=cdx_row.surt,
            timestamp=cdx_row.datetime,
            url=cdx_row.url,
            sha1hex=cdx_row.sha1hex,
            mimetype=cdx_row.mimetype,
            status_code=cdx_row.status_code,
            size=None,
            sha256hex=None,
            resource_type=resource['type'],
        ))

    return full
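# ---------------------------------------------------------------------------
# Editor's sketch, NOT part of the original commit: the docstring above
# mentions circuit-breaking on failure. Assuming the same
# CdxApiClient.lookup_best() interface, a minimal version could skip
# individual missing CDX rows but abort the batch once a failure budget is
# exhausted. The function name and `max_failures` parameter are hypothetical.
# ---------------------------------------------------------------------------

def quick_fetch_with_circuit_breaker(
    resources: List[dict],
    cdx_client: CdxApiClient,
    when: Optional[datetime.datetime],
    max_failures: int = 3,
) -> List[WebResource]:
    failures = 0
    full = []
    for resource in resources:
        cdx_row = cdx_client.lookup_best(resource['url'])
        if not cdx_row:
            # tolerate a few missing lookups, then give up on the whole batch
            failures += 1
            if failures >= max_failures:
                raise Exception(f"aborting batch after {failures} failed CDX lookups")
            continue
        full.append(WebResource(
            surt=cdx_row.surt,
            timestamp=cdx_row.datetime,
            url=cdx_row.url,
            sha1hex=cdx_row.sha1hex,
            mimetype=cdx_row.mimetype,
            status_code=cdx_row.status_code,
            size=None,
            sha256hex=None,
            resource_type=resource['type'],
        ))
    return full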
def fetch_html_resources(resources: List[dict], wayback_client: WaybackClient, when: Optional[datetime.datetime]) -> List[WebResource]:
    """
    This is the full version, which fetches each resource from wayback/petabox
    and calculates additional hashes.

    Could make this concurrent in the future, eg: https://realpython.com/python-concurrency/#threading-version
    """

    full = []
    for resource in resources:
        wayback_resp = wayback_client.lookup_resource(resource['url'])
        if not wayback_resp:
            raise Exception("wayback lookup failed")
        assert wayback_resp.status == 'success'
        if wayback_resp.cdx.url != resource['url']:
            pass
            #raise Exception(
            #    f"CDX lookup URL mismatch: {wayback_resp.cdx.url} != {resource['url']}")
        file_meta = gen_file_metadata(wayback_resp.body)
        assert file_meta['sha1hex'] == wayback_resp.cdx.sha1hex
        full.append(WebResource(
            surt=wayback_resp.cdx.surt,
            timestamp=wayback_resp.cdx.datetime,
            url=wayback_resp.cdx.url,
            sha1hex=file_meta['sha1hex'],
            mimetype=file_meta['mimetype'],
            status_code=wayback_resp.cdx.status_code,
            size=file_meta['size_bytes'],
            sha256hex=file_meta['sha256hex'],
            resource_type=resource['type'],
        ))

    return full


def run_single(url: str, timestamp: Optional[str] = None, quick_mode: bool = False) -> None:

    adblock = load_adblock_rules()
    wayback_client = WaybackClient()

    html_resource = wayback_client.lookup_resource(url, "text/html")
    if html_resource.status != "success":
        print(json.dumps(html_resource, indent=2))
        return

    file_meta = gen_file_metadata(html_resource.body)
    # XXX: handle non-HTML responses instead of asserting
    assert file_meta['mimetype'] == "text/html"

    html_doc = HTMLParser(html_resource.body)
    html_meta = html_extract_biblio(html_doc)
    html_fulltext = html_extract_fulltext_teixml(html_resource.body)
    raw_resources = html_extract_resources(html_resource.terminal_url, html_doc, adblock)

    # XXX: should derive `when` from the (currently unused) timestamp argument
    when = None

    full_resources: List[WebResource] = []
    if quick_mode:
        full_resources = quick_fetch_html_resources(raw_resources, wayback_client.cdx_client, when)
    else:
        full_resources = fetch_html_resources(raw_resources, wayback_client, when)

    output = dict(
        status="success",
        #html_resource=html_resource,
        file_meta=file_meta,
        html_fulltext=html_fulltext,
        # XXX:
        html_meta=html_meta and html_meta.dict(exclude_none=True, exclude={'release_date'}),
        resources=[r.dict(exclude_none=True, exclude={'timestamp'}) for r in full_resources],
    )

    print(json.dumps(output, indent=2))


def main() -> None:
    """
    Run this command like:

        python -m sandcrawler.html_ingest
    """

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    subparsers = parser.add_subparsers()

    sub = subparsers.add_parser(
        "single", help="tries to ingest a single URL, dumps result to stdout"
    )
    sub.set_defaults(func="run_single")
    sub.add_argument(
        "url",
        help="URL to fetch",
        type=str,
    )
    sub.add_argument(
        "--timestamp",
        help="timestamp for which to fetch document from wayback",
        type=str,
    )
    sub.add_argument(
        "--quick-mode",
        help="don't fetch resources, only do CDX lookup",
        action="store_true",
    )

    args = parser.parse_args()
    if not args.__dict__.get("func"):
        print("tell me what to do! (try --help)")
        sys.exit(-1)

    if args.func == "run_single":
        run_single(args.url, args.timestamp, args.quick_mode)
    else:
        #func = getattr(wp, args.func)
        #func()
        raise NotImplementedError()


if __name__ == "__main__":
    main()
--
cgit v1.2.3
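Editor's note: the fetch_html_resources() docstring above suggests making the
fetching concurrent with threads. As a rough sketch only (not part of this
patch), a thread-pool variant could reuse the sequential function for each
individual resource. The wrapper name and max_workers default are assumptions,
and it presumes WaybackClient is safe to share across threads, which would
need verification:

    from concurrent.futures import ThreadPoolExecutor

    def fetch_html_resources_threaded(resources, wayback_client, when, max_workers=4):
        # fetches are I/O-bound, so threads should overlap the network waits
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            nested = list(pool.map(
                lambda r: fetch_html_resources([r], wayback_client, when),
                resources,
            ))
        # each call returns a one-element list; flatten back into one list
        return [r for batch in nested for r in batch]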