author    Bryan Newbold <bnewbold@archive.org>  2020-10-30 15:17:14 -0700
committer Bryan Newbold <bnewbold@archive.org>  2020-10-30 15:17:14 -0700
commit    08bf16e6da9666bb81e4d1ecddff48fe7cf9205c (patch)
tree      41552977a735d13152a6fe01704b839633e121c1
parent    24bfdfaa260156e395c509f0c18657e79dc6f730 (diff)
download  sandcrawler-08bf16e6da9666bb81e4d1ecddff48fe7cf9205c.tar.gz
          sandcrawler-08bf16e6da9666bb81e4d1ecddff48fe7cf9205c.zip
html: more ingest improvements
-rw-r--r--  python/sandcrawler/html_ingest.py   | 136
-rw-r--r--  python/sandcrawler/html_metadata.py |   2
2 files changed, 120 insertions, 18 deletions
diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py
index f28231e..e86fa2b 100644
--- a/python/sandcrawler/html_ingest.py
+++ b/python/sandcrawler/html_ingest.py
@@ -1,16 +1,19 @@
+import io
import sys
+import gzip
import json
import datetime
import argparse
+import xml.etree.ElementTree as ET
-from typing import List, Optional, Any
+from typing import List, Optional, Tuple, Any
import trafilatura
import pydantic
from selectolax.parser import HTMLParser
-from sandcrawler.ia import WaybackClient, CdxApiClient, ResourceResult
-from sandcrawler.misc import gen_file_metadata
+from sandcrawler.ia import WaybackClient, CdxApiClient, ResourceResult, cdx_to_dict
+from sandcrawler.misc import gen_file_metadata, parse_cdx_datetime, datetime_to_cdx
from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules
@@ -25,6 +28,15 @@ def html_extract_fulltext_teixml(doc: bytes) -> dict:
    else:
        return dict(status="empty-xml")
+def teixml_body_text(doc_xml: str) -> str:
+    ns = {"tei": "http://www.tei-c.org/ns/1.0"}
+    tree = ET.fromstring(doc_xml)
+    body = tree.find('.//tei:body', ns)
+    if body is not None:
+        return " ".join(body.itertext())
+    else:
+        return ""
+
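For reference, a minimal usage sketch of the teixml_body_text() helper above (the sample TEI document is illustrative, following the TEI P5 namespace convention; it is not taken from sandcrawler):

    sample_tei = """<TEI xmlns="http://www.tei-c.org/ns/1.0">
      <text><body><p>Hello full text world.</p></body></text>
    </TEI>"""

    # body.itertext() walks every nested text node, so markup is dropped
    # and only the concatenated body text survives
    assert teixml_body_text(sample_tei) == "Hello full text world."
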
class WebResource(pydantic.BaseModel):
    surt: str
    timestamp: datetime.datetime
@@ -43,19 +55,45 @@ class WebResource(pydantic.BaseModel):
class IngestWebResult(pydantic.BaseModel):
    status: str
+    hit: bool
+    cdx: Optional[dict]
+    terminal: Optional[Any]  # TODO
    request: Optional[Any]  # TODO
-    html_resource: Optional[ResourceResult]
    file_meta: Optional[dict]
+    html_biblio: Optional[BiblioMetadata]
+    html_scope: Optional[str]
    html_fulltext: Optional[dict]
-    html_meta: Optional[BiblioMetadata]
    subresources: Optional[List[WebResource]]
    class Config:
+        arbitrary_types_allowed = True
        json_encoders = {
-            datetime.datetime: lambda dt: dt.isoformat()
+            datetime.datetime: lambda dt: dt.isoformat(),
        }
+def fix_transfer_encoding(file_meta: dict, resource: ResourceResult) -> Tuple[dict, ResourceResult]:
+    if file_meta['mimetype'] == 'application/gzip' and resource.cdx and resource.cdx.mimetype != 'application/gzip':
+        print("transfer encoding not stripped: {}".format(resource.cdx.mimetype), file=sys.stderr)
+        inner_body = gzip.decompress(resource.body)
+        inner_resource = ResourceResult(
+            body=inner_body,
+            # copy all other fields
+            start_url=resource.start_url,
+            hit=resource.hit,
+            status=resource.status,
+            terminal_url=resource.terminal_url,
+            terminal_dt=resource.terminal_dt,
+            terminal_status_code=resource.terminal_status_code,
+            cdx=resource.cdx,
+            revisit_cdx=resource.revisit_cdx,
+        )
+        inner_file_meta = gen_file_metadata(inner_resource.body)
+        return (inner_file_meta, inner_resource)
+    else:
+        return (file_meta, resource)
+
+
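The double-compression situation fix_transfer_encoding() handles can be reproduced standalone; a rough sketch of the underlying mechanics (assuming gen_file_metadata() sniffs the mimetype from magic bytes, which is why a still-compressed payload shows up as application/gzip):

    import gzip

    page = b"<html><body>hello</body></html>"
    body = gzip.compress(page)  # what comes back when the Content-Encoding
                                # wrapper was archived instead of stripped

    # the CDX row says text/html, but magic-byte sniffing says
    # application/gzip; decompressing once recovers the real document
    assert gzip.decompress(body) == page
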
def quick_fetch_html_resources(resources: List[dict], cdx_client: CdxApiClient, when: Optional[datetime.datetime]) -> List[WebResource]:
"""
This is the lazy version that just does a CDX lookup for each resource.
@@ -65,8 +103,9 @@ def quick_fetch_html_resources(resources: List[dict], cdx_client: CdxApiClient,
"""
full = []
+ closest = when and datetime_to_cdx(when)
for resource in resources:
- cdx_row = cdx_client.lookup_best(resource['url'])
+ cdx_row = cdx_client.lookup_best(resource['url'], closest=closest)
if not cdx_row:
raise Exception("CDX lookup failed")
if cdx_row.url != resource['url']:
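datetime_to_cdx() is imported from sandcrawler.misc and is not shown in this diff; a hedged sketch of the expected conversion, assuming the standard 14-digit CDX timestamp format:

    import datetime

    def datetime_to_cdx_sketch(dt: datetime.datetime) -> str:
        # the CDX API 'closest' parameter takes YYYYMMDDHHMMSS
        return dt.strftime("%Y%m%d%H%M%S")

    when = datetime.datetime(2020, 10, 30, 15, 17, 14)
    assert datetime_to_cdx_sketch(when) == "20201030151714"
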
@@ -97,8 +136,9 @@ def fetch_html_resources(resources: List[dict], wayback_client: WaybackClient, w
"""
full = []
+ closest = when and datetime_to_cdx(when)
for resource in resources:
- wayback_resp = wayback_client.lookup_resource(resource['url'])
+ wayback_resp = wayback_client.lookup_resource(resource['url'], closest=closest)
if not wayback_resp:
raise Exception("wayback lookup failed")
# XXX
@@ -108,7 +148,7 @@ def fetch_html_resources(resources: List[dict], wayback_client: WaybackClient, w
raise Exception("wayback payload sha1hex mismatch")
full.append(WebResource(
surt=wayback_resp.cdx.surt,
- timestamp=wayback_resp.cdx.datetime,
+ timestamp=parse_cdx_datetime(wayback_resp.cdx.datetime),
url=wayback_resp.cdx.url,
sha1hex=file_meta['sha1hex'],
mimetype=file_meta['mimetype'],
@@ -121,34 +161,92 @@ def fetch_html_resources(resources: List[dict], wayback_client: WaybackClient, w
    return full
+def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata], tei_xml: Optional[str]) -> str:
+    """
+    This function tries to guess if an HTML document represents one of:
+
+    - article-fulltext
+    - article-abstract
+    - article-sample
+    - supplement
+    - component
+    - issue-fulltext
+    - landingpage
+    - paywall
+    - loginwall
+    - blockpage
+    - errorpage
+    - stub
+    - unknown
+    """
+
+    # basic paywall and loginwall detection based on URL
+    if url.endswith("/cookieAbsent"):
+        return "blockpage"
+    if "://page-one.live.cf.public.springer.com" in url:
+        return "article-sample"
+
+    if biblio and biblio.html_fulltext_url == url:
+        return "article-fulltext"
+
+    # fallback: guess based on word count (thresholds here are arbitrary)
+    if not tei_xml:
+        return "unknown"
+    body_txt = teixml_body_text(tei_xml)
+    word_count = len(body_txt.split())
+    #print(f"  body text word count: {word_count}", file=sys.stderr)
+    if word_count < 20:
+        return "stub"
+    elif word_count > 800:
+        return "article-fulltext"
+
+    return "unknown"
+
+
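A quick illustration of the word-count fallback in html_guess_scope() (the toy TEI input and URL are made up; the 20-word cutoff is the arbitrary threshold from the function above):

    from selectolax.parser import HTMLParser

    stub_tei = '<TEI xmlns="http://www.tei-c.org/ns/1.0"><text><body><p>Too short.</p></body></text></TEI>'
    scope = html_guess_scope(
        url="https://example.com/article/123",
        doc=HTMLParser("<html></html>"),
        biblio=None,
        tei_xml=stub_tei,
    )
    assert scope == "stub"  # two words, well under the 20-word cutoff
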
def run_single(url: str, timestamp: Optional[str] = None, quick_mode: bool = False) -> IngestWebResult:
    adblock = load_adblock_rules()
    wayback_client = WaybackClient()
-    html_resource = wayback_client.lookup_resource(url, "text/html")
+    html_resource = wayback_client.lookup_resource(url, "text/html", closest=timestamp)
    if html_resource.status != "success":
        return IngestWebResult(
            status=html_resource.status,
-            html_resource=html_resource,
+            hit=False,
+            cdx=html_resource.cdx and cdx_to_dict(html_resource.cdx),
        )
+    assert html_resource.terminal_status_code == 200
+
    file_meta = gen_file_metadata(html_resource.body)
+    file_meta, html_resource = fix_transfer_encoding(file_meta, html_resource)
-    if file_meta['mimetype'] != "text/html":
+    if file_meta['mimetype'] not in ("text/html", "text/xml"):
        return IngestWebResult(
            status="wrong-mimetype",
-            html_resource=html_resource,
+            hit=False,
+            cdx=html_resource.cdx and cdx_to_dict(html_resource.cdx),
            file_meta=file_meta,
        )
    html_doc = HTMLParser(html_resource.body)
-    html_meta = html_extract_biblio(html_doc)
+    html_biblio = html_extract_biblio(html_doc)
    html_fulltext = html_extract_fulltext_teixml(html_resource.body)
+    html_scope = html_guess_scope(url, html_doc, html_biblio, html_fulltext.get('tei_xml'))
+    if html_scope not in ('article-fulltext', 'unknown'):
+        return IngestWebResult(
+            status="wrong-scope",
+            hit=False,
+            cdx=html_resource.cdx and cdx_to_dict(html_resource.cdx),
+            file_meta=file_meta,
+            html_biblio=html_biblio,
+            html_scope=html_scope,
+        )
+
    raw_resources = html_extract_resources(html_resource.terminal_url, html_doc, adblock)
+    assert len(raw_resources) <= 200
-    # XXX:
-    when = None
+    when = parse_cdx_datetime(html_resource.cdx.datetime)
    full_resources: List[WebResource] = []
    if quick_mode:
@@ -158,10 +256,12 @@ def run_single(url: str, timestamp: Optional[str] = None, quick_mode: bool = Fal
    output = IngestWebResult(
        status="success",
-        html_resource=html_resource,
+        hit=True,
+        cdx=html_resource.cdx and cdx_to_dict(html_resource.cdx),
        file_meta=file_meta,
        html_fulltext=html_fulltext,
-        html_meta=html_meta,
+        html_biblio=html_biblio,
+        html_scope=html_scope,
        subresources=full_resources,
    )
    return output
@@ -206,7 +306,7 @@ def main() -> None:
    if args.func == "run_single":
        result = run_single(args.url, args.timestamp, args.quick_mode)
-        print(result.json(indent=2))
+        print(result.json(indent=2, exclude_none=True))
    else:
        #func = getattr(wp, args.func)
        #func()
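exclude_none is standard pydantic (v1) serialization behavior, not something defined in this patch; a minimal sketch of the pattern IngestWebResult relies on:

    import datetime
    from typing import Optional
    import pydantic

    class Example(pydantic.BaseModel):
        status: str
        cdx: Optional[dict]

        class Config:
            json_encoders = {
                datetime.datetime: lambda dt: dt.isoformat(),
            }

    # None-valued fields are dropped from the JSON output entirely
    print(Example(status="success", cdx=None).json(exclude_none=True))
    # -> {"status": "success"}
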
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 6b1bdef..d3ca1b7 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -14,6 +14,8 @@ import braveblock
# - google scholar crawling notes (https://scholar.google.com/intl/ja/scholar/inclusion.html#indexing)
# - inspection of actual publisher HTML
# - http://div.div1.com.au/div-thoughts/div-commentaries/66-div-commentary-metadata
+# - "HTML meta tags used by journal articles"
+# https://gist.github.com/hubgit/5985963
# order of these is mostly by preference/quality (best option first), though
# also/sometimes re-ordered for lookup efficiency (lookup stops after first
# match)
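
The gist referenced above catalogs Highwire Press-style tags such as <meta name="citation_title" ...>, which Google Scholar documents for indexing; a hedged sketch of reading one with selectolax (this mirrors the general approach, not the exact selector tables in html_metadata.py):

    from selectolax.parser import HTMLParser

    html = '<html><head><meta name="citation_title" content="On Sandcrawlers"></head></html>'
    node = HTMLParser(html).css_first('meta[name="citation_title"]')
    if node is not None:
        print(node.attributes.get("content"))  # -> On Sandcrawlers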