author     Bryan Newbold <bnewbold@archive.org>  2020-10-29 14:31:36 -0700
committer  Bryan Newbold <bnewbold@archive.org>  2020-10-29 14:31:36 -0700
commit     336d6000d5007e010476df7a767f6a1361ae2229 (patch)
tree       f07d2a18d3665e874a1b5ce9dafc1917b45d218e /python
parent     3d56509ef83226a808ebb078f5cac9815afb5d9d (diff)
html: initial ingest implementation
Diffstat (limited to 'python')
-rw-r--r--  python/sandcrawler/html_ingest.py | 193
1 file changed, 193 insertions(+), 0 deletions(-)
diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py
new file mode 100644
index 0000000..0e1439c
--- /dev/null
+++ b/python/sandcrawler/html_ingest.py
@@ -0,0 +1,193 @@
+
+import sys
+import json
+import datetime
+import argparse
+from typing import List, Optional
+
+import trafilatura
+import pydantic
+from selectolax.parser import HTMLParser
+
+from sandcrawler.ia import WaybackClient, CdxApiClient, ResourceResult
+from sandcrawler.misc import gen_file_metadata
+from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules
+
+
+def html_extract_fulltext_teixml(doc: bytes) -> dict:
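+    """
+    Runs trafilatura fulltext extraction over raw HTML bytes, returning a
+    status dict containing TEI-XML on success.
+    """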
+ tei_xml = trafilatura.extract(doc,
+ tei_output=True,
+ include_comments=False,
+ include_formatting=True,
+ )
+ if tei_xml:
+ return dict(status="success", tei_xml=tei_xml)
+ else:
+ return dict(status="empty-xml")
+
+class WebResource(pydantic.BaseModel):
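+    """
+    A single archived sub-resource (image, stylesheet, script, etc.) of an
+    HTML document, as looked up in CDX and (optionally) fetched from wayback.
+    """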
+ surt: str
+ timestamp: datetime.datetime
+ url: str
+ sha1hex: str
+ mimetype: str
+ status_code: int
+ size: Optional[int]
+ sha256hex: Optional[str]
+ resource_type: Optional[str]
+
+
+def quick_fetch_html_resources(resources: List[dict], cdx_client: CdxApiClient, when: Optional[datetime.datetime]) -> List[WebResource]:
+ """
+ This is the lazy version that just does a CDX lookup for each resource.
+
+ Takes a list instead of single record because we may want to circuit break
+ on failure, and may introduce concurrency internal to this function.
+ """
+
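+    # NOTE: the `when` argument is accepted for future use but not yet
+    # consulted; lookup_best() returns whichever capture the CDX API
+    # ranks best for each URL.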
+ full = []
+ for resource in resources:
+ cdx_row = cdx_client.lookup_best(resource['url'])
+ if not cdx_row:
+ raise Exception("CDX lookup failed")
+ if cdx_row.url != resource['url']:
+ pass
+ #raise Exception(
+ # f"CDX lookup URL mismatch: {cdx_row.url} != {resource['url']}")
+ full.append(WebResource(
+ surt=cdx_row.surt,
+ timestamp=cdx_row.datetime,
+ url=cdx_row.url,
+ sha1hex=cdx_row.sha1hex,
+ mimetype=cdx_row.mimetype,
+ status_code=cdx_row.status_code,
+ size=None,
+ sha256hex=None,
+ resource_type=resource['type'],
+ ))
+
+ return full
+
+
+def fetch_html_resources(resources: List[dict], wayback_client: WaybackClient, when: Optional[datetime.datetime]) -> List[WebResource]:
+ """
+ This is the full version which fetches each resource from wayback/petabox
+ and calculates additional hashes.
+
+ Could make this concurrent in the future, eg: https://realpython.com/python-concurrency/#threading-version
+ """
+
+ full = []
+ for resource in resources:
+ wayback_resp = wayback_client.lookup_resource(resource['url'])
+ if not wayback_resp:
+ raise Exception("wayback lookup failed")
+ assert wayback_resp.status == 'success'
+ if wayback_resp.cdx.url != resource['url']:
+ pass
+ #raise Exception(
+ # f"CDX lookup URL mismatch: {cdx_row.url} != {resource['url']}")
+ file_meta = gen_file_metadata(wayback_resp.body)
+ assert file_meta['sha1hex'] == wayback_resp.cdx.sha1hex
+ full.append(WebResource(
+ surt=wayback_resp.cdx.surt,
+ timestamp=wayback_resp.cdx.datetime,
+ url=wayback_resp.cdx.url,
+ sha1hex=file_meta['sha1hex'],
+ mimetype=file_meta['mimetype'],
+ status_code=wayback_resp.cdx.status_code,
+ size=file_meta['size_bytes'],
+ sha256hex=file_meta['sha256hex'],
+ resource_type=resource['type'],
+ ))
+
+ return full
+
+
+def run_single(url: str, timestamp: Optional[str] = None, quick_mode: bool = False) -> None:
+
+ adblock = load_adblock_rules()
+ wayback_client = WaybackClient()
+
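+    # resolve the URL to an archived HTML capture and fetch the body
+    # itself from wayback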
+ html_resource = wayback_client.lookup_resource(url, "text/html")
+ if html_resource.status != "success":
+ print(json.dumps(html_resource, indent=2))
+ return
+
+ file_meta = gen_file_metadata(html_resource.body)
+ # XXX:
+ assert file_meta['mimetype'] == "text/html"
+
+ html_doc = HTMLParser(html_resource.body)
+ html_meta = html_extract_biblio(html_doc)
+ html_fulltext = html_extract_fulltext_teixml(html_resource.body)
+ raw_resources = html_extract_resources(html_resource.terminal_url, html_doc, adblock)
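+
+    # at this point we have raw HTML bytes, a parsed DOM, biblio metadata,
+    # TEI-XML fulltext, and a list of candidate sub-resources to fetch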
+
+ # XXX:
+ when = None
+
+ full_resources: List[WebResource] = []
+ if quick_mode:
+ full_resources = quick_fetch_html_resources(raw_resources, wayback_client.cdx_client, when)
+ else:
+ full_resources = fetch_html_resources(raw_resources, wayback_client, when)
+
+ output = dict(
+ status="success",
+ #html_resource=html_resource,
+ file_meta=file_meta,
+ html_fulltext=html_fulltext,
+ # XXX:
+ html_meta=html_meta and html_meta.dict(exclude_none=True, exclude={'release_date'}),
+ resources=[r.dict(exclude_none=True, exclude={'timestamp'}) for r in full_resources],
+ )
+
+ print(json.dumps(output, indent=2))
+
+
+def main() -> None:
+ """
+ Run this command like:
+
+        python -m sandcrawler.html_ingest single <url>
+ """
+
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter
+ )
+ subparsers = parser.add_subparsers()
+
+ sub = subparsers.add_parser(
+ "single", help="tries to ingest a single URL, dumps result to stdout"
+ )
+ sub.set_defaults(func="run_single")
+ sub.add_argument(
+ "url",
+ help="URL to fetch",
+ type=str,
+ )
+ sub.add_argument(
+ "--timestamp",
+ help="timestamp for which to fetch document from wayback",
+ type=str,
+ )
+ sub.add_argument(
+ "--quick-mode",
+ help="don't fetch resources, only do CDX lookup",
+ action="store_true",
+ )
+
+ args = parser.parse_args()
+ if not args.__dict__.get("func"):
+ print("tell me what to do! (try --help)")
+ sys.exit(-1)
+
+ if args.func == "run_single":
+ run_single(args.url, args.timestamp, args.quick_mode)
+ else:
+ #func = getattr(wp, args.func)
+ #func()
+ raise NotImplementedError()
+
+if __name__ == "__main__":
+ main()