|   |   |   |
|---|---|---|
| author | Bryan Newbold <bnewbold@archive.org> | 2020-10-30 15:17:14 -0700 |
| committer | Bryan Newbold <bnewbold@archive.org> | 2020-10-30 15:17:14 -0700 |
| commit | 08bf16e6da9666bb81e4d1ecddff48fe7cf9205c (patch) | |
| tree | 41552977a735d13152a6fe01704b839633e121c1 | |
| parent | 24bfdfaa260156e395c509f0c18657e79dc6f730 (diff) | |
| download | sandcrawler-08bf16e6da9666bb81e4d1ecddff48fe7cf9205c.tar.gz, sandcrawler-08bf16e6da9666bb81e4d1ecddff48fe7cf9205c.zip | |
html: more ingest improvements
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | python/sandcrawler/html_ingest.py | 136 |
| -rw-r--r-- | python/sandcrawler/html_metadata.py | 2 |

2 files changed, 120 insertions, 18 deletions
diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py
index f28231e..e86fa2b 100644
--- a/python/sandcrawler/html_ingest.py
+++ b/python/sandcrawler/html_ingest.py
@@ -1,16 +1,19 @@
+import io
 import sys
+import gzip
 import json
 import datetime
 import argparse
+import xml.etree.ElementTree as ET
 from typing import List, Optional, Any
 
 import trafilatura
 import pydantic
 from selectolax.parser import HTMLParser
 
-from sandcrawler.ia import WaybackClient, CdxApiClient, ResourceResult
-from sandcrawler.misc import gen_file_metadata
+from sandcrawler.ia import WaybackClient, CdxApiClient, ResourceResult, cdx_to_dict
+from sandcrawler.misc import gen_file_metadata, parse_cdx_datetime, datetime_to_cdx
 from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules
@@ -25,6 +28,15 @@ def html_extract_fulltext_teixml(doc: bytes) -> dict:
     else:
         return dict(status="empty-xml")
 
+def teixml_body_text(doc_xml: str) -> str:
+    ns = {"tei": "http://www.tei-c.org/ns/1.0"}
+    tree = ET.fromstring(doc_xml)
+    body = tree.find('.//tei:body', ns)
+    if body:
+        return " ".join(body.itertext())
+    else:
+        return ""
+
 class WebResource(pydantic.BaseModel):
     surt: str
     timestamp: datetime.datetime
@@ -43,19 +55,45 @@ class WebResource(pydantic.BaseModel):
 
 class IngestWebResult(pydantic.BaseModel):
     status: str
+    hit: bool
+    cdx: Optional[dict]
+    terminal: Optional[Any] # TODO
     request: Optional[Any] # TODO
-    html_resource: Optional[ResourceResult]
     file_meta: Optional[dict]
+    html_biblio: Optional[BiblioMetadata]
+    html_scope: Optional[str]
     html_fulltext: Optional[dict]
-    html_meta: Optional[BiblioMetadata]
     subresources: Optional[List[WebResource]]
 
     class Config:
+        arbitrary_types_allowed = True
         json_encoders = {
-            datetime.datetime: lambda dt: dt.isoformat()
+            datetime.datetime: lambda dt: dt.isoformat(),
         }
 
+
+def fix_transfer_encoding(file_meta: dict, resource: ResourceResult) -> (dict, ResourceResult):
+    if file_meta['mimetype'] == 'application/gzip' and resource.cdx and resource.cdx.mimetype != 'application/gzip':
+        print("transfer encoding not stripped: {}".format(resource.cdx.mimetype), file=sys.stderr)
+        inner_body = gzip.decompress(resource.body)
+        inner_resource = ResourceResult(
+            body=inner_body,
+            # copy all other fields
+            start_url=resource.start_url,
+            hit=resource.hit,
+            status=resource.status,
+            terminal_url=resource.terminal_url,
+            terminal_dt=resource.terminal_dt,
+            terminal_status_code=resource.terminal_status_code,
+            cdx=resource.cdx,
+            revisit_cdx=resource.revisit_cdx,
+        )
+        inner_file_meta = gen_file_metadata(inner_resource.body)
+        return (inner_file_meta, inner_resource)
+    else:
+        return (file_meta, resource)
+
+
 def quick_fetch_html_resources(resources: List[dict], cdx_client: CdxApiClient, when: Optional[datetime.datetime]) -> List[WebResource]:
     """
     This is the lazy version that just does a CDX lookup for each resource.
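A note on the new `fix_transfer_encoding()` helper above: some web captures store the body with its HTTP `Content-Encoding: gzip` layer intact, so `gen_file_metadata()` reports `application/gzip` instead of HTML. The helper detects that mismatch against the CDX-recorded mimetype, decompresses, and recomputes file metadata. A minimal standalone sketch of the same idea using only the standard library (`unwrap_gzip_body` is a hypothetical name; the real helper also rebuilds the `ResourceResult`):

```python
import gzip

def unwrap_gzip_body(body: bytes) -> bytes:
    """Return the decompressed payload if `body` is still gzip-wrapped
    (transfer encoding never stripped at capture time), else return it
    unchanged."""
    # gzip streams always start with the magic bytes 0x1f 0x8b
    if body[:2] == b"\x1f\x8b":
        return gzip.decompress(body)
    return body

# a gzip-wrapped HTML payload round-trips back to the original bytes
wrapped = gzip.compress(b"<html><body>hello</body></html>")
assert unwrap_gzip_body(wrapped) == b"<html><body>hello</body></html>"
assert unwrap_gzip_body(b"plain bytes") == b"plain bytes"
```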
@@ -65,8 +103,9 @@ def quick_fetch_html_resources(resources: List[dict], cdx_client: CdxApiClient,
     """
 
     full = []
+    closest = when and datetime_to_cdx(when)
     for resource in resources:
-        cdx_row = cdx_client.lookup_best(resource['url'])
+        cdx_row = cdx_client.lookup_best(resource['url'], closest=closest)
         if not cdx_row:
             raise Exception("CDX lookup failed")
         if cdx_row.url != resource['url']:
@@ -97,8 +136,9 @@ def fetch_html_resources(resources: List[dict], wayback_client: WaybackClient, w
     """
 
     full = []
+    closest = when and datetime_to_cdx(when)
     for resource in resources:
-        wayback_resp = wayback_client.lookup_resource(resource['url'])
+        wayback_resp = wayback_client.lookup_resource(resource['url'], closest=closest)
         if not wayback_resp:
             raise Exception("wayback lookup failed") # XXX
@@ -108,7 +148,7 @@ def fetch_html_resources(resources: List[dict], wayback_client: WaybackClient, w
             raise Exception("wayback payload sha1hex mismatch")
         full.append(WebResource(
             surt=wayback_resp.cdx.surt,
-            timestamp=wayback_resp.cdx.datetime,
+            timestamp=parse_cdx_datetime(wayback_resp.cdx.datetime),
             url=wayback_resp.cdx.url,
             sha1hex=file_meta['sha1hex'],
             mimetype=file_meta['mimetype'],
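Both fetch paths now thread a `closest` timestamp into their lookups, so subresources resolve to captures near the parent page's capture time rather than the newest ones; note the `when and datetime_to_cdx(when)` idiom, which passes `None` through when no timestamp is available. The two helpers imported from `sandcrawler.misc` convert between `datetime` and the 14-digit CDX timestamp format. A plausible sketch of what they do (the real implementations may differ in edge cases, e.g. timezone handling):

```python
import datetime

def datetime_to_cdx(dt: datetime.datetime) -> str:
    # CDX API timestamps are 14-digit strings: YYYYMMDDHHMMSS
    return dt.strftime("%Y%m%d%H%M%S")

def parse_cdx_datetime(dt_str: str) -> datetime.datetime:
    return datetime.datetime.strptime(dt_str, "%Y%m%d%H%M%S")

when = datetime.datetime(2020, 10, 30, 15, 17, 14)
assert datetime_to_cdx(when) == "20201030151714"
assert parse_cdx_datetime("20201030151714") == when
```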
@@ -121,34 +161,92 @@ def fetch_html_resources(resources: List[dict], wayback_client: WaybackClient, w
     return full
 
+
+def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata], tei_xml: Optional[str]) -> str:
+    """
+    This function tries to guess if an HTML document represents one of:
+
+    - article-fulltext
+    - article-abstract
+    - article-sample
+    - supplement
+    - component
+    - issue-fulltext
+    - landingpage
+    - paywall
+    - loginwall
+    - blockpage
+    - errorpage
+    - stub
+    - unknown
+    """
+
+    # basic paywall and loginwall detection based on URL
+    if url.endswith("/cookieAbsent"):
+        return "blockpage"
+    if "://page-one.live.cf.public.springer.com" in url:
+        return "article-sample"
+
+    if biblio and biblio.html_fulltext_url == url:
+        return "article-fulltext"
+
+    # fallback: guess based word count (arbitrary guesses here)
+    if not tei_xml:
+        return "unknown"
+    body_txt = teixml_body_text(tei_xml)
+    word_count = len(body_txt.split())
+    #print(f"  body text word count: {word_count}", file=sys.stderr)
+    if word_count < 20:
+        return "stub"
+    elif word_count > 800:
+        return "article-fulltext"
+
+    return "unknown"
+
+
 def run_single(url: str, timestamp: Optional[str] = None, quick_mode: bool = False) -> IngestWebResult:
 
     adblock = load_adblock_rules()
     wayback_client = WaybackClient()
 
-    html_resource = wayback_client.lookup_resource(url, "text/html")
+    html_resource = wayback_client.lookup_resource(url, "text/html", closest=timestamp)
     if html_resource.status != "success":
         return IngestWebResult(
             status=html_resource.status,
-            html_resource=html_resource,
+            hit=False,
+            cdx=html_resource.cdx and cdx_to_dict(html_resource.cdx),
         )
 
+    assert html_resource.terminal_status_code == 200
+
     file_meta = gen_file_metadata(html_resource.body)
+    file_meta, html_resource = fix_transfer_encoding(file_meta, html_resource)
 
-    if file_meta['mimetype'] != "text/html":
+    if file_meta['mimetype'] not in ("text/html", "text/xml"):
         return IngestWebResult(
             status="wrong-mimetype",
-            html_resource=html_resource,
+            hit=False,
+            cdx=html_resource.cdx and cdx_to_dict(html_resource.cdx),
             file_meta=file_meta,
         )
 
     html_doc = HTMLParser(html_resource.body)
-    html_meta = html_extract_biblio(html_doc)
+    html_biblio = html_extract_biblio(html_doc)
     html_fulltext = html_extract_fulltext_teixml(html_resource.body)
+    html_scope = html_guess_scope(url, html_doc, html_biblio, html_fulltext.get('tei_xml'))
+    if html_scope not in ('article-fulltext', 'unknown'):
+        return IngestWebResult(
+            status="wrong-scope",
+            hit=False,
+            cdx=html_resource.cdx and cdx_to_dict(html_resource.cdx),
+            file_meta=file_meta,
+            html_biblio=html_biblio,
+            html_scope=html_scope,
+        )
+
     raw_resources = html_extract_resources(html_resource.terminal_url, html_doc, adblock)
+    assert len(raw_resources) <= 200
 
-    # XXX:
-    when = None
+    when = parse_cdx_datetime(html_resource.cdx.datetime)
 
     full_resources: List[WebResource] = []
     if quick_mode:
@@ -158,10 +256,12 @@ def run_single(url: str, timestamp: Optional[str] = None, quick_mode: bool = Fal
 
     output = IngestWebResult(
         status="success",
-        html_resource=html_resource,
+        hit=True,
+        cdx=html_resource.cdx and cdx_to_dict(html_resource.cdx),
         file_meta=file_meta,
         html_fulltext=html_fulltext,
-        html_meta=html_meta,
+        html_biblio=html_biblio,
+        html_scope=html_scope,
         subresources=full_resources,
     )
     return output
@@ -206,7 +306,7 @@ def main() -> None:
 
     if args.func == "run_single":
         result = run_single(args.url, args.timestamp, args.quick_mode)
-        print(result.json(indent=2))
+        print(result.json(indent=2, exclude_none=True))
     else:
         #func = getattr(wp, args.func)
         #func()
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 6b1bdef..d3ca1b7 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -14,6 +14,8 @@ import braveblock
 #   - google scholar crawling notes (https://scholar.google.com/intl/ja/scholar/inclusion.html#indexing)
 #   - inspection of actual publisher HTML
 #   - http://div.div1.com.au/div-thoughts/div-commentaries/66-div-commentary-metadata
+#   - "HTML meta tags used by journal articles"
+#     https://gist.github.com/hubgit/5985963
 
 # order of these are mostly by preference/quality (best option first), though
 # also/sometimes re-ordered for lookup efficiency (lookup stops after first
 # match)
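The new scope heuristic leans on `teixml_body_text()`: extract the TEI `<body>` from trafilatura's XML output, count words, and treat anything under 20 words as a stub and anything over 800 as full text. A self-contained sketch of that path using only the standard library (`tei_body_word_count` is a hypothetical name; the commit splits this between `teixml_body_text()` and inline counting in `html_guess_scope()`). One subtlety worth flagging: the committed helper tests `if body:`, but an `ElementTree` element with no children is falsy even when present, so `body is not None` is the more defensive check, as below:

```python
import xml.etree.ElementTree as ET

def tei_body_word_count(tei_xml: str) -> int:
    """Count whitespace-separated words in the <body> of a TEI document
    (the XML format trafilatura emits)."""
    ns = {"tei": "http://www.tei-c.org/ns/1.0"}
    body = ET.fromstring(tei_xml).find(".//tei:body", ns)
    if body is None:  # no <body> element at all
        return 0
    return len(" ".join(body.itertext()).split())

doc = """<TEI xmlns="http://www.tei-c.org/ns/1.0">
  <text><body><p>just a few words here</p></body></text>
</TEI>"""
assert tei_body_word_count(doc) == 5  # under 20 words: scope would be "stub"
```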