html: initial ingest implementation

author: Bryan Newbold <bnewbold@archive.org> 2020-10-29 14:31:36 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-10-29 14:31:36 -0700
commit: 336d6000d5007e010476df7a767f6a1361ae2229 (patch)
tree: f07d2a18d3665e874a1b5ce9dafc1917b45d218e
parent: 3d56509ef83226a808ebb078f5cac9815afb5d9d (diff)
download: sandcrawler-336d6000d5007e010476df7a767f6a1361ae2229.tar.gz
sandcrawler-336d6000d5007e010476df7a767f6a1361ae2229.zip
1 files changed, 193 insertions, 0 deletions
diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py
new file mode 100644
index 0000000..0e1439c
--- /dev/null
+++ b/python/sandcrawler/html_ingest.py
@@ -0,0 +1,193 @@
+
+import sys
+import json
+import datetime
+import argparse
+from typing import List, Optional, Any
+
+import trafilatura
+import pydantic
+from selectolax.parser import HTMLParser
+
+from sandcrawler.ia import WaybackClient, CdxApiClient, ResourceResult
+from sandcrawler.misc import gen_file_metadata
+from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules
+
+
+def html_extract_fulltext_teixml(doc: bytes) -> dict:
+    """
+    Runs trafilatura over a raw HTML document, asking for TEI-XML output.
+
+    Returns a dict with status "success" plus the "tei_xml" output, or just
+    status "empty-xml" when trafilatura returned nothing.
+    """
+    extract_opts = dict(tei_output=True, include_comments=False, include_formatting=True)
+    extracted = trafilatura.extract(doc, **extract_opts)
+    return dict(status="success", tei_xml=extracted) if extracted else dict(status="empty-xml")
+
+class WebResource(pydantic.BaseModel):  # one captured sub-resource of an HTML page
+    surt: str  # taken from the CDX row's `surt` field
+    timestamp: datetime.datetime  # taken from the CDX row's `datetime` field
+    url: str  # URL as recorded in the CDX row (may differ from the requested URL)
+    sha1hex: str
+    mimetype: str  # from CDX in quick mode; from gen_file_metadata() on a full fetch
+    status_code: int
+    size: Optional[int]  # only set by fetch_html_resources() (quick mode passes None)
+    sha256hex: Optional[str]  # only set by fetch_html_resources() (quick mode passes None)
+    resource_type: Optional[str]  # the 'type' value of the extracted resource dict
+
+
+def quick_fetch_html_resources(resources: List[dict], cdx_client: CdxApiClient, when: Optional[datetime.datetime]) -> List[WebResource]:
+    """
+    Lazy variant: resolves every resource with a single CDX lookup and never
+    fetches the body, so `size` and `sha256hex` are left unset.
+
+    Accepts the whole list (not one record at a time) so that a failure can
+    short-circuit the batch, and to leave room for internal concurrency.
+
+    `when` is accepted but not currently consulted. Raises a plain Exception
+    as soon as any lookup comes back empty.
+    """
+
+    def _to_web_resource(raw: dict) -> WebResource:
+        row = cdx_client.lookup_best(raw['url'])
+        if not row:
+            raise Exception("CDX lookup failed")
+        # NOTE: row.url may differ from raw['url']; that is tolerated for now
+        return WebResource(
+            surt=row.surt,
+            timestamp=row.datetime,
+            url=row.url,
+            sha1hex=row.sha1hex,
+            mimetype=row.mimetype,
+            status_code=row.status_code,
+            size=None,
+            sha256hex=None,
+            resource_type=raw['type'],
+        )
+
+    return [_to_web_resource(raw) for raw in resources]
+
+
+def fetch_html_resources(resources: List[dict], wayback_client: WaybackClient, when: Optional[datetime.datetime]) -> List[WebResource]:
+    """
+    This is the full version which fetches each resource from wayback/petabox
+    and calculates additional hashes. Raises Exception if a fetch fails or the
+    body's SHA-1 disagrees with the CDX record; these are explicit checks (not
+    asserts) so they still run under `python -O`. `when` is currently unused.
+
+    Could make this concurrent in the future, eg: https://realpython.com/python-concurrency/#threading-version
+    """
+
+    full = []
+    for resource in resources:
+        wayback_resp = wayback_client.lookup_resource(resource['url'])
+        if not wayback_resp:
+            raise Exception("wayback lookup failed")
+        if wayback_resp.status != 'success':
+            raise Exception(f"wayback lookup not successful ({wayback_resp.status}): {resource['url']}")
+        file_meta = gen_file_metadata(wayback_resp.body)
+        if file_meta['sha1hex'] != wayback_resp.cdx.sha1hex:
+            raise Exception(f"sha1hex mismatch between fetched body and CDX: {resource['url']}")
+        full.append(WebResource(
+            surt=wayback_resp.cdx.surt,
+            timestamp=wayback_resp.cdx.datetime,
+            url=wayback_resp.cdx.url,  # may differ from resource['url']; not enforced
+            sha1hex=file_meta['sha1hex'],
+            mimetype=file_meta['mimetype'],
+            status_code=wayback_resp.cdx.status_code,
+            size=file_meta['size_bytes'],
+            sha256hex=file_meta['sha256hex'],
+            resource_type=resource['type'],
+        ))
+
+    return full
+
+
+def run_single(url: str, timestamp: Optional[str] = None, quick_mode: bool = False) -> None:
+    """
+    Looks up `url` in wayback as HTML, extracts biblio metadata, TEI-XML
+    fulltext and sub-resources, then prints one JSON result to stdout. A
+    non-"success" lookup is printed as-is instead. `timestamp` is unused so far.
+    """
+    adblock = load_adblock_rules()
+    wayback_client = WaybackClient()
+
+    html_resource = wayback_client.lookup_resource(url, "text/html")
+    if html_resource.status != "success":
+        print(json.dumps(html_resource, indent=2))
+        return
+
+    file_meta = gen_file_metadata(html_resource.body)
+    # XXX: hard failure whenever the computed mimetype is not HTML
+    assert file_meta['mimetype'] == "text/html"
+
+    html_doc = HTMLParser(html_resource.body)
+    html_meta = html_extract_biblio(html_doc)
+    html_fulltext = html_extract_fulltext_teixml(html_resource.body)
+    raw_resources = html_extract_resources(html_resource.terminal_url, html_doc, adblock)
+
+    # XXX: no target datetime is handed to the resource fetchers yet
+    when = None
+    if quick_mode:
+        full_resources = quick_fetch_html_resources(raw_resources, wayback_client.cdx_client, when)
+    else:
+        full_resources = fetch_html_resources(raw_resources, wayback_client, when)
+
+    output = {
+        "status": "success",
+        "file_meta": file_meta,
+        "html_fulltext": html_fulltext,
+        # XXX: release_date is left out (presumably not JSON-serializable; confirm)
+        "html_meta": html_meta.dict(exclude_none=True, exclude={'release_date'}) if html_meta else html_meta,
+        "resources": [r.dict(exclude_none=True, exclude={'timestamp'}) for r in full_resources],
+    }
+    print(json.dumps(output, indent=2))
+
+
+def main() -> None:
+    """
+    Command-line entry point. Run this command like:
+
+        python -m sandcrawler.html_ingest
+
+    Only the "single" sub-command is wired up; it hands off to run_single().
+    """
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    subparsers = parser.add_subparsers()
+
+    single = subparsers.add_parser(
+        "single", help="tries to ingest a single URL, dumps result to stdout"
+    )
+    single.set_defaults(func="run_single")
+    # (flag, keyword options) pairs for the "single" sub-command
+    single_args = [
+        ("url", dict(
+            help="URL to fetch",
+            type=str,
+        )),
+        ("--timestamp", dict(
+            help="timestamp for which to fetch document from wayback",
+            type=str,
+        )),
+        ("--quick-mode", dict(
+            help="don't fetch resources, only do CDX lookup",
+            action="store_true",
+        )),
+    ]
+    for flag, options in single_args:
+        single.add_argument(flag, **options)
+
+    args = parser.parse_args()
+    command = vars(args).get("func")
+    if not command:
+        # no sub-command was given, so no `func` default was applied
+        print("tell me what to do! (try --help)")
+        sys.exit(-1)
+
+    if command != "run_single":
+        raise NotImplementedError()
+    run_single(args.url, args.timestamp, args.quick_mode)
+
+if __name__ == "__main__":
+    main()
author	Bryan Newbold <bnewbold@archive.org>	2020-10-29 14:31:36 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2020-10-29 14:31:36 -0700
commit	336d6000d5007e010476df7a767f6a1361ae2229 (patch)
tree	f07d2a18d3665e874a1b5ce9dafc1917b45d218e
parent	3d56509ef83226a808ebb078f5cac9815afb5d9d (diff)
download	sandcrawler-336d6000d5007e010476df7a767f6a1361ae2229.tar.gz sandcrawler-336d6000d5007e010476df7a767f6a1361ae2229.zip