Diffstat (limited to 'python/sandcrawler/ingest_html.py')
-rw-r--r-- | python/sandcrawler/ingest_html.py | 441 |
1 files changed, 441 insertions, 0 deletions
diff --git a/python/sandcrawler/ingest_html.py b/python/sandcrawler/ingest_html.py
new file mode 100644
index 0000000..f11cac4
--- /dev/null
+++ b/python/sandcrawler/ingest_html.py
@@ -0,0 +1,441 @@

import io
import sys
import json
import datetime
import argparse
import xml.etree.ElementTree as ET
from typing import List, Optional, Any, Tuple

import trafilatura
import pydantic
from selectolax.parser import HTMLParser

from sandcrawler.ia import WaybackClient, CdxApiClient, ResourceResult, cdx_to_dict, fix_transfer_encoding, NoCaptureError, WaybackContentError
from sandcrawler.misc import gen_file_metadata, parse_cdx_datetime, datetime_to_cdx, clean_url, url_fuzzy_equal
from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules


TRAFILATURA_AGENT = f"trafilatura/{trafilatura.__version__}"

def html_extract_body_teixml(doc: bytes) -> dict:
    try:
        tei_xml = trafilatura.extract(doc,
            tei_output=True,
            include_comments=False,
            include_formatting=True,
        )
    except Exception as e:
        # trafilatura can raise a wide variety of parse errors; capture them all
        return dict(
            status="trafilatura-parse-error",
            error_msg=str(e)[:1000],
        )
    if tei_xml:
        body_txt = teixml_body_text(tei_xml)
        word_count = len(body_txt.split())
        return dict(status="success", agent=TRAFILATURA_AGENT, tei_xml=tei_xml, word_count=word_count)
    elif doc.startswith(b'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">'):
        # hack for firstmonday.org
        return html_extract_body_teixml(doc[106:])
    else:
        return dict(status="empty-xml", agent=TRAFILATURA_AGENT)

def teixml_body_text(doc_xml: str) -> str:
    ns = {"tei": "http://www.tei-c.org/ns/1.0"}
    tree = ET.fromstring(doc_xml)
    body = tree.find('.//tei:body', ns)
    # compare against None: Element truthiness is based on child count, so a
    # <body> with text but no child elements would otherwise be skipped
    if body is not None:
        return " ".join(body.itertext())
    else:
        return ""

class WebResource(pydantic.BaseModel):
    surt: str
    timestamp: datetime.datetime
    url: str
    sha1hex: str
    mimetype: str
    status_code: int
    size: Optional[int]
    sha256hex: Optional[str]
    resource_type: Optional[str]

    class Config:
        json_encoders = {
            datetime.datetime: lambda dt: dt.isoformat()
        }

class IngestWebResult(pydantic.BaseModel):
    status: str
    hit: bool
    error_message: Optional[str]
    cdx: Optional[dict]
    terminal: Optional[Any]  # TODO
    request: Optional[Any]  # TODO
    file_meta: Optional[dict]
    html_biblio: Optional[BiblioMetadata]
    scope: Optional[str]
    html_body: Optional[dict]
    html_resources: Optional[List[WebResource]]

    class Config:
        arbitrary_types_allowed = True
        json_encoders = {
            datetime.datetime: lambda dt: dt.isoformat(),
        }

class HtmlMetaRow(pydantic.BaseModel):
    sha1hex: str
    status: str
    scope: Optional[str]
    has_teixml: bool
    has_thumbnail: bool
    word_count: Optional[int]
    biblio: Optional[dict]
    resources: Optional[List[dict]]

    class Config:
        arbitrary_types_allowed = True
        json_encoders = {
            datetime.datetime: lambda dt: dt.isoformat(),
        }

    def to_sql_tuple(self) -> Tuple:
        """
        This is for the html_meta SQL table.
        """
        return (
            self.sha1hex,
            datetime.datetime.now(),  # updated
            self.status,
            self.scope,
            self.has_teixml,
            self.has_thumbnail,
            self.word_count,
            (self.biblio or None) and json.dumps(self.biblio, sort_keys=True),
            (self.resources or None) and json.dumps(self.resources, sort_keys=True),
        )
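
# A minimal sketch of how HtmlMetaRow pairs with the html_meta table; column
# names and order here are inferred from to_sql_tuple() above, not taken from
# the actual SQL schema:
#
#   row = HtmlMetaRow(sha1hex="...", status="success", scope="article-fulltext",
#                     has_teixml=True, has_thumbnail=False, word_count=2500,
#                     biblio=None, resources=None)
#   cur.execute(
#       "INSERT INTO html_meta (sha1hex, updated, status, scope, has_teixml,"
#       " has_thumbnail, word_count, biblio, resources)"
#       " VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)",
#       row.to_sql_tuple(),
#   )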
+ """ + return ( + self.sha1hex, + datetime.datetime.now(), # updated + self.status, + self.scope, + self.has_teixml, + self.has_thumbnail, + self.word_count, + (self.biblio or None) and json.dumps(self.biblio, sort_keys=True), + (self.resources or None) and json.dumps(self.resources, sort_keys=True), + ) + + +def quick_fetch_html_resources(resources: List[dict], cdx_client: CdxApiClient, when: Optional[datetime.datetime]) -> List[WebResource]: + """ + This is the lazy version that just does a CDX lookup for each resource. + + Takes a list instead of single record because we may want to circuit break + on failure, and may introduce concurrency internal to this function. + """ + + full = [] + closest = when and datetime_to_cdx(when) + for resource in resources: + cdx_row = cdx_client.lookup_best(resource['url'], closest=closest) + if not cdx_row: + raise NoCaptureError(f"HTML sub-resource not found: {resource['url']}") + if cdx_row.url != resource['url'] and not url_fuzzy_equal(cdx_row.url, resource['url']): + print(f" WARN: CDX fuzzy match: {cdx_row.url} != {resource['url']}", file=sys.stderr) + if not cdx_row.status_code: + # TODO: fall back to a full fetch? + print(f" WARN: skipping revisit record", file=sys.stderr) + continue + full.append(WebResource( + surt=cdx_row.surt, + timestamp=cdx_row.datetime, + url=cdx_row.url, + sha1hex=cdx_row.sha1hex, + mimetype=cdx_row.mimetype, + status_code=cdx_row.status_code, + size=None, + sha256hex=None, + resource_type=resource['type'], + )) + + return full + + +def fetch_html_resources(resources: List[dict], wayback_client: WaybackClient, when: Optional[datetime.datetime]) -> List[WebResource]: + """ + This is the full version which fetches each resource from wayback/petabox + and calculates additional hashes. 


def fetch_html_resources(resources: List[dict], wayback_client: WaybackClient, when: Optional[datetime.datetime]) -> List[WebResource]:
    """
    This is the full version which fetches each resource from wayback/petabox
    and calculates additional hashes.

    Could make this concurrent in the future, eg: https://realpython.com/python-concurrency/#threading-version
    """

    full = []
    closest = when and datetime_to_cdx(when)
    for resource in resources:
        wayback_resp = wayback_client.lookup_resource(resource['url'], closest=closest)
        if not wayback_resp or wayback_resp.status != 'success':
            raise NoCaptureError(f"HTML sub-resource not found: {resource['url']}")
        file_meta = gen_file_metadata(wayback_resp.body, allow_empty=True)
        if file_meta['sha1hex'] != wayback_resp.cdx.sha1hex:
            raise WaybackContentError(f"wayback payload sha1hex mismatch: {wayback_resp.cdx.datetime} {wayback_resp.cdx.url}")
        full.append(WebResource(
            surt=wayback_resp.cdx.surt,
            timestamp=parse_cdx_datetime(wayback_resp.cdx.datetime),
            url=wayback_resp.cdx.url,
            sha1hex=file_meta['sha1hex'],
            mimetype=file_meta['mimetype'],
            status_code=wayback_resp.cdx.status_code or wayback_resp.revisit_cdx.status_code,
            size=file_meta['size_bytes'],
            sha256hex=file_meta['sha256hex'],
            resource_type=resource['type'],
        ))

    return full


def html_guess_platform(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]) -> Optional[str]:

    generator: Optional[str] = None
    generator_elem = doc.css_first("meta[name='generator']")
    if generator_elem:
        generator = generator_elem.attrs['content']
    else:
        generator_elem = doc.css_first("a[id='developedBy']")
        if generator_elem:
            generator = generator_elem.text()
    if generator and "open journal systems 3" in generator.lower():
        return "ojs3"
    elif generator and "open journal systems" in generator.lower():
        return "ojs"
    elif generator and "plone" in generator.lower():
        return "plone"
    elif generator and "wordpress" in generator.lower():
        return "wordpress"
    elif generator and "blogger" in generator.lower():
        return "blogger"
    elif doc.css_first("body[id='pkp-common-openJournalSystems']"):
        return "ojs"
    else:
        try:
            if 'powered by <a target="blank" href="http://pkp.sfu.ca/ojs/">PKP OJS</a>' in doc.html:
                return "ojs"
            if 'Powered by <a target="_blank" href="http://arphahub.com">' in doc.html:
                return "arpha"
            if "<meta property='og:image' content='http://cms.galenos.com.tr' />" in doc.html:
                return "galenos"
        except UnicodeDecodeError:
            pass

    icon_elem = doc.css_first("link[type='image/x-icon']")
    if icon_elem and 'href' in icon_elem.attrs:
        if 'journalssystem.com' in icon_elem.attrs['href']:
            return "journalssystem.com"
        elif 'indexcopernicus.com' in icon_elem.attrs['href']:
            return "indexcopernicus"

    if 'scielo' in url:
        return "scielo"

    return None
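
# A quick sketch of html_guess_platform() on a synthetic document (the HTML
# snippet is invented for illustration):
#
#   doc = HTMLParser(b"<html><head><meta name='generator' "
#                    b"content='Open Journal Systems 3.1.2'></head></html>")
#   html_guess_platform("http://example.com/article/view/123", doc, None)
#   # -> "ojs3"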
+ """ + + # assert that this is a real URL + assert url.count('/') >= 2 + + # basic paywall and loginwall detection based on URL + if url.endswith("/cookieAbsent"): + return "blocked-cookie" + if "://page-one.live.cf.public.springer.com" in url: + return "article-sample" + + if "scielo" in url: + if "sci_abstract" in url: + return "landingpage" + if "sci_arttext" in url: + return "article-fulltext" + + if "showcaptcha.asp" in url: + return "blocked-captcha" + + # is this the top-level URL of the domain? aka, no path? + if url.count('/') <= 2 or (url.count('/') == 3) and url.endswith('/'): + return "homepage-domain" + + platform = html_guess_platform(url, doc, biblio) + + if biblio: + if biblio.html_fulltext_url: + if url_fuzzy_equal(biblio.html_fulltext_url, url): + return "article-fulltext" + else: + return "landingpage" + + # platform-specific detection + if platform in ("ojs", "ojs3"): + + if biblio and biblio.title: + if word_count and word_count > 1200: + return "fulltext" + else: + return "landingpage" + else: + if "/article/view/" in url and word_count and word_count > 600: + return "fulltext" + return "other" + elif platform == "journalssystem.com": + if biblio and biblio.pdf_fulltext_url and word_count and word_count < 1000: + return "landingpage" + + # more platform/publisher specific checks + if "karger.com/Article/Abstract" in url: + return "landingpage" + if "dergipark.gov.tr" in url and not ("download/article-file" in url): + return "other" + + try: + if isinstance(doc.html, str) and "<center><h1>403 Forbidden</h1></center>" in doc.html: + # cloudflare block pattern + return "blocked-forbidden" + except UnicodeDecodeError: + pass + + print(f" scope guessing: platform {platform} word count: {word_count}", file=sys.stderr) + + # fallback: guess based on word count (arbitrary guesses here) + if word_count is not None: + if word_count < 20: + return "stub" + elif word_count > 500 and platform in ['wordpress', 'blogger']: + return "article-fulltext" + elif word_count > 1200: + return "article-fulltext" + + return "unknown" + + +def run_single(url: str, timestamp: Optional[str] = None, quick_mode: bool = False) -> IngestWebResult: + + adblock = load_adblock_rules() + wayback_client = WaybackClient() + + html_resource = wayback_client.lookup_resource(url, "text/html", closest=timestamp) + if html_resource.status != "success": + return IngestWebResult( + status=html_resource.status, + hit=False, + cdx=html_resource.cdx and cdx_to_dict(html_resource.cdx), + ) + + assert html_resource.terminal_status_code == 200 + + file_meta = gen_file_metadata(html_resource.body) + file_meta, html_resource = fix_transfer_encoding(file_meta, html_resource) + + if file_meta['mimetype'] not in ("text/html", "text/xml"): + return IngestWebResult( + status="wrong-mimetype", + hit=False, + cdx=html_resource.cdx and cdx_to_dict(html_resource.cdx), + file_meta=file_meta, + ) + + html_doc = HTMLParser(html_resource.body) + html_biblio = html_extract_biblio(url, html_doc) + html_body = html_extract_body_teixml(html_resource.body) + html_scope = html_guess_scope(url, html_doc, html_biblio, html_body.get('word_count')) + if html_scope not in ('article-fulltext', 'unknown'): + return IngestWebResult( + status="wrong-scope", + hit=False, + cdx=html_resource.cdx and cdx_to_dict(html_resource.cdx), + file_meta=file_meta, + html_biblio=html_biblio, + scope=html_scope, + ) + + raw_resources = html_extract_resources(html_resource.terminal_url, html_doc, adblock) + assert len(raw_resources) <= 200 + + when = 


def run_single(url: str, timestamp: Optional[str] = None, quick_mode: bool = False) -> IngestWebResult:

    adblock = load_adblock_rules()
    wayback_client = WaybackClient()

    html_resource = wayback_client.lookup_resource(url, "text/html", closest=timestamp)
    if html_resource.status != "success":
        return IngestWebResult(
            status=html_resource.status,
            hit=False,
            cdx=html_resource.cdx and cdx_to_dict(html_resource.cdx),
        )

    assert html_resource.terminal_status_code == 200

    file_meta = gen_file_metadata(html_resource.body)
    file_meta, html_resource = fix_transfer_encoding(file_meta, html_resource)

    if file_meta['mimetype'] not in ("text/html", "text/xml"):
        return IngestWebResult(
            status="wrong-mimetype",
            hit=False,
            cdx=html_resource.cdx and cdx_to_dict(html_resource.cdx),
            file_meta=file_meta,
        )

    html_doc = HTMLParser(html_resource.body)
    html_biblio = html_extract_biblio(url, html_doc)
    html_body = html_extract_body_teixml(html_resource.body)
    html_scope = html_guess_scope(url, html_doc, html_biblio, html_body.get('word_count'))
    if html_scope not in ('article-fulltext', 'unknown'):
        return IngestWebResult(
            status="wrong-scope",
            hit=False,
            cdx=html_resource.cdx and cdx_to_dict(html_resource.cdx),
            file_meta=file_meta,
            html_biblio=html_biblio,
            scope=html_scope,
        )

    raw_resources = html_extract_resources(html_resource.terminal_url, html_doc, adblock)
    assert len(raw_resources) <= 200

    when = parse_cdx_datetime(html_resource.cdx.datetime)

    full_resources: List[WebResource] = []
    if quick_mode:
        full_resources = quick_fetch_html_resources(raw_resources, wayback_client.cdx_client, when)
    else:
        full_resources = fetch_html_resources(raw_resources, wayback_client, when)

    output = IngestWebResult(
        status="success",
        hit=True,
        cdx=html_resource.cdx and cdx_to_dict(html_resource.cdx),
        file_meta=file_meta,
        html_body=html_body,
        html_biblio=html_biblio,
        scope=html_scope,
        html_resources=full_resources,
    )
    return output


def main() -> None:
    """
    Run this command like:

        python -m sandcrawler.ingest_html
    """

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    subparsers = parser.add_subparsers()

    sub = subparsers.add_parser(
        "single", help="tries to ingest a single URL, dumps result to stdout"
    )
    sub.set_defaults(func="run_single")
    sub.add_argument(
        "url",
        help="URL to fetch",
        type=str,
    )
    sub.add_argument(
        "--timestamp",
        help="timestamp for which to fetch document from wayback",
        type=str,
    )
    sub.add_argument(
        "--quick-mode",
        help="don't fetch resources, only do CDX lookup",
        action="store_true",
    )

    args = parser.parse_args()
    if not args.__dict__.get("func"):
        parser.print_help(file=sys.stderr)
        sys.exit(-1)

    if args.func == "run_single":
        result = run_single(args.url, args.timestamp, args.quick_mode)
        print(result.json(indent=2, exclude_none=True))
    else:
        #func = getattr(wp, args.func)
        #func()
        raise NotImplementedError()

if __name__ == "__main__":
    main()
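
For reference, a single-URL ingest can be invoked via the "single" subcommand
defined in main() above (URL and timestamp values are illustrative):

    python -m sandcrawler.ingest_html single \
        --timestamp 20201028230305 \
        --quick-mode \
        http://example.com/article/view/123

This prints the resulting IngestWebResult as indented JSON to stdout; with
--quick-mode, sub-resources are resolved by CDX lookup only instead of being
fetched from wayback.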