Diffstat (limited to 'python/sandcrawler/ingest_html.py')
 python/sandcrawler/ingest_html.py | 139 +++++++++++++++++------------
 1 file changed, 82 insertions(+), 57 deletions(-)
diff --git a/python/sandcrawler/ingest_html.py b/python/sandcrawler/ingest_html.py
index 91e5c6e..0ff7fe0 100644
--- a/python/sandcrawler/ingest_html.py
+++ b/python/sandcrawler/ingest_html.py
@@ -9,12 +9,26 @@ import pydantic
import trafilatura
from selectolax.parser import HTMLParser
-from sandcrawler.html_metadata import (BiblioMetadata, html_extract_biblio,
- html_extract_resources, load_adblock_rules)
-from sandcrawler.ia import (CdxApiClient, NoCaptureError, WaybackClient, WaybackContentError,
- cdx_to_dict, fix_transfer_encoding)
-from sandcrawler.misc import (datetime_to_cdx, gen_file_metadata, parse_cdx_datetime,
- url_fuzzy_equal)
+from sandcrawler.html_metadata import (
+ BiblioMetadata,
+ html_extract_biblio,
+ html_extract_resources,
+ load_adblock_rules,
+)
+from sandcrawler.ia import (
+ CdxApiClient,
+ NoCaptureError,
+ WaybackClient,
+ WaybackContentError,
+ cdx_to_dict,
+ fix_transfer_encoding,
+)
+from sandcrawler.misc import (
+ datetime_to_cdx,
+ gen_file_metadata,
+ parse_cdx_datetime,
+ url_fuzzy_equal,
+)
TRAFILATURA_AGENT = f"trafilatura/{trafilatura.__version__}"
@@ -23,7 +37,7 @@ def html_extract_body_teixml(doc: bytes) -> dict:
try:
tei_xml = trafilatura.extract(
doc,
- output_format='xmltei',
+ output_format="xmltei",
include_comments=False,
include_formatting=True,
)
@@ -35,12 +49,11 @@ def html_extract_body_teixml(doc: bytes) -> dict:
if tei_xml:
body_txt = teixml_body_text(tei_xml)
word_count = len(body_txt.split())
- return dict(status="success",
- agent=TRAFILATURA_AGENT,
- tei_xml=tei_xml,
- word_count=word_count)
+ return dict(
+ status="success", agent=TRAFILATURA_AGENT, tei_xml=tei_xml, word_count=word_count
+ )
elif doc.startswith(
- b'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">'
+ b'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">'
):
# hack for firstmonday.org
return html_extract_body_teixml(doc[106:])
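
For reference, the trafilatura call this helper wraps can be exercised standalone; the sample HTML below is invented for illustration:

    import trafilatura

    html = b"<html><body><article><p>Some example article text, long enough to extract.</p></article></body></html>"
    tei_xml = trafilatura.extract(
        html,
        output_format="xmltei",    # TEI-XML output, as in html_extract_body_teixml()
        include_comments=False,
        include_formatting=True,
    )
    print(tei_xml)  # TEI document string, or None if no body text was found
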
@@ -51,7 +64,7 @@ def html_extract_body_teixml(doc: bytes) -> dict:
def teixml_body_text(doc_xml: str) -> str:
ns = {"tei": "http://www.tei-c.org/ns/1.0"}
tree = ET.fromstring(doc_xml)
- body = tree.find('.//tei:body', ns)
+ body = tree.find(".//tei:body", ns)
if body:
return " ".join(body.itertext())
else:
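
A quick standalone check of the TEI body-text extraction above, using an invented fragment:

    import xml.etree.ElementTree as ET

    tei = '<TEI xmlns="http://www.tei-c.org/ns/1.0"><text><body><p>Hello</p><p>world</p></body></text></TEI>'
    ns = {"tei": "http://www.tei-c.org/ns/1.0"}
    body = ET.fromstring(tei).find(".//tei:body", ns)
    print(" ".join(body.itertext()))  # -> "Hello world"
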
@@ -126,8 +139,9 @@ class HtmlMetaRow(pydantic.BaseModel):
)
-def quick_fetch_html_resources(resources: List[dict], cdx_client: CdxApiClient,
- when: Optional[datetime.datetime]) -> List[WebResource]:
+def quick_fetch_html_resources(
+ resources: List[dict], cdx_client: CdxApiClient, when: Optional[datetime.datetime]
+) -> List[WebResource]:
"""
This is the lazy version that just does a CDX lookup for each resource.
@@ -138,12 +152,13 @@ def quick_fetch_html_resources(resources: List[dict], cdx_client: CdxApiClient,
full = []
closest = when and datetime_to_cdx(when)
for resource in resources:
- cdx_row = cdx_client.lookup_best(resource['url'], closest=closest)
+ cdx_row = cdx_client.lookup_best(resource["url"], closest=closest)
if not cdx_row:
raise NoCaptureError(f"HTML sub-resource not found: {resource['url']}")
- if cdx_row.url != resource['url'] and not url_fuzzy_equal(cdx_row.url, resource['url']):
- print(f" WARN: CDX fuzzy match: {cdx_row.url} != {resource['url']}",
- file=sys.stderr)
+ if cdx_row.url != resource["url"] and not url_fuzzy_equal(cdx_row.url, resource["url"]):
+ print(
+ f" WARN: CDX fuzzy match: {cdx_row.url} != {resource['url']}", file=sys.stderr
+ )
if not cdx_row.status_code:
# TODO: fall back to a full fetch?
print(" WARN: skipping revisit record", file=sys.stderr)
@@ -158,14 +173,16 @@ def quick_fetch_html_resources(resources: List[dict], cdx_client: CdxApiClient,
status_code=cdx_row.status_code,
size=None,
sha256hex=None,
- resource_type=resource['type'],
- ))
+ resource_type=resource["type"],
+ )
+ )
return full
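
Hypothetical usage of this lazy variant; the client construction and resource list below are assumptions for illustration, not part of this diff:

    from sandcrawler.ia import CdxApiClient
    from sandcrawler.ingest_html import quick_fetch_html_resources

    cdx_client = CdxApiClient()  # assuming the default constructor
    resources = [{"url": "http://example.com/style.css", "type": "stylesheet"}]
    web_resources = quick_fetch_html_resources(resources, cdx_client, when=None)
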
-def fetch_html_resources(resources: List[dict], wayback_client: WaybackClient,
- when: Optional[datetime.datetime]) -> List[WebResource]:
+def fetch_html_resources(
+ resources: List[dict], wayback_client: WaybackClient, when: Optional[datetime.datetime]
+) -> List[WebResource]:
"""
This is the full version which fetches each resource from wayback/petabox
and calculates additional hashes.
@@ -176,11 +193,11 @@ def fetch_html_resources(resources: List[dict], wayback_client: WaybackClient,
full = []
closest = when and datetime_to_cdx(when)
for resource in resources:
- wayback_resp = wayback_client.lookup_resource(resource['url'], closest=closest)
- if not wayback_resp or wayback_resp.status != 'success':
+ wayback_resp = wayback_client.lookup_resource(resource["url"], closest=closest)
+ if not wayback_resp or wayback_resp.status != "success":
raise NoCaptureError(f"HTML sub-resource not found: {resource['url']}")
file_meta = gen_file_metadata(wayback_resp.body, allow_empty=True)
- if file_meta['sha1hex'] != wayback_resp.cdx.sha1hex:
+ if file_meta["sha1hex"] != wayback_resp.cdx.sha1hex:
raise WaybackContentError(
f"wayback payload sha1hex mismatch: {wayback_resp.cdx.datetime} {wayback_resp.cdx.url}"
)
@@ -189,25 +206,27 @@ def fetch_html_resources(resources: List[dict], wayback_client: WaybackClient,
surt=wayback_resp.cdx.surt,
timestamp=parse_cdx_datetime(wayback_resp.cdx.datetime),
url=wayback_resp.cdx.url,
- sha1hex=file_meta['sha1hex'],
- mimetype=file_meta['mimetype'],
+ sha1hex=file_meta["sha1hex"],
+ mimetype=file_meta["mimetype"],
status_code=wayback_resp.cdx.status_code
or wayback_resp.revisit_cdx.status_code,
- size=file_meta['size_bytes'],
- sha256hex=file_meta['sha256hex'],
- resource_type=resource['type'],
- ))
+ size=file_meta["size_bytes"],
+ sha256hex=file_meta["sha256hex"],
+ resource_type=resource["type"],
+ )
+ )
return full
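
The gen_file_metadata() fields consumed above (sha1hex, sha256hex, size_bytes, mimetype) boil down to roughly the following; a simplified stand-in, not sandcrawler's actual implementation:

    import hashlib

    def file_metadata_sketch(body: bytes) -> dict:
        return dict(
            size_bytes=len(body),
            sha1hex=hashlib.sha1(body).hexdigest(),
            sha256hex=hashlib.sha256(body).hexdigest(),
            mimetype="application/octet-stream",  # the real helper sniffs content type
        )
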
-def html_guess_platform(url: str, doc: HTMLParser,
- biblio: Optional[BiblioMetadata]) -> Optional[str]:
+def html_guess_platform(
+ url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]
+) -> Optional[str]:
generator: Optional[str] = None
generator_elem = doc.css_first("meta[name='generator']")
if generator_elem:
- generator = generator_elem.attrs['content']
+ generator = generator_elem.attrs["content"]
else:
generator_elem = doc.css_first("a[id='developedBy']")
if generator_elem:
@@ -226,7 +245,10 @@ def html_guess_platform(url: str, doc: HTMLParser,
return "ojs"
else:
try:
- if 'powered by <a target="blank" href="http://pkp.sfu.ca/ojs/">PKP OJS</a>' in doc.html:
+ if (
+ 'powered by <a target="blank" href="http://pkp.sfu.ca/ojs/">PKP OJS</a>'
+ in doc.html
+ ):
return "ojs"
if 'Powered by <a target="_blank" href="http://arphahub.com">' in doc.html:
return "arpha"
@@ -236,20 +258,21 @@ def html_guess_platform(url: str, doc: HTMLParser,
pass
icon_elem = doc.css_first("link[type='image/x-icon']")
- if icon_elem and 'href' in icon_elem.attrs:
- if 'journalssystem.com' in icon_elem.attrs['href']:
+ if icon_elem and "href" in icon_elem.attrs:
+ if "journalssystem.com" in icon_elem.attrs["href"]:
return "journalssystem.com"
- elif 'indexcopernicus.com' in icon_elem.attrs['href']:
+ elif "indexcopernicus.com" in icon_elem.attrs["href"]:
return "indexcopernicus"
- if 'scielo' in url:
+ if "scielo" in url:
return "scielo"
return None
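
The generator-tag sniffing at the top of html_guess_platform(), in isolation (sample HTML invented):

    from selectolax.parser import HTMLParser

    doc = HTMLParser('<meta name="generator" content="Open Journal Systems 3.1.2">')
    elem = doc.css_first("meta[name='generator']")
    if elem:
        print(elem.attrs["content"])  # -> "Open Journal Systems 3.1.2"
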
-def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata],
- word_count: Optional[int]) -> str:
+def html_guess_scope(
+ url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata], word_count: Optional[int]
+) -> str:
"""
This function tries to guess if an HTML document represents one of:
@@ -275,7 +298,7 @@ def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]
"""
# assert that this is a real URL
- assert url.count('/') >= 2
+ assert url.count("/") >= 2
# basic paywall and loginwall detection based on URL
if url.endswith("/cookieAbsent"):
@@ -293,7 +316,7 @@ def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]
return "blocked-captcha"
# is this the top-level URL of the domain? aka, no path?
- if url.count('/') <= 2 or (url.count('/') == 3) and url.endswith('/'):
+ if url.count("/") <= 2 or (url.count("/") == 3) and url.endswith("/"):
return "homepage-domain"
platform = html_guess_platform(url, doc, biblio)
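
Since `and` binds tighter than `or`, the homepage check above reads as: two slashes or fewer, or exactly three slashes with a trailing slash. Spelled out with invented URLs:

    for url in ["https://example.com", "https://example.com/", "https://example.com/article/123"]:
        print(url, url.count("/") <= 2 or (url.count("/") == 3 and url.endswith("/")))
    # -> True, True, False
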
@@ -340,7 +363,7 @@ def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]
if word_count is not None:
if word_count < 20:
return "stub"
- elif word_count > 500 and platform in ['wordpress', 'blogger']:
+ elif word_count > 500 and platform in ["wordpress", "blogger"]:
return "article-fulltext"
elif word_count > 1200:
return "article-fulltext"
@@ -348,9 +371,9 @@ def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata]
return "unknown"
-def run_single(url: str,
- timestamp: Optional[str] = None,
- quick_mode: bool = False) -> IngestWebResult:
+def run_single(
+ url: str, timestamp: Optional[str] = None, quick_mode: bool = False
+) -> IngestWebResult:
adblock = load_adblock_rules()
wayback_client = WaybackClient()
@@ -368,7 +391,7 @@ def run_single(url: str,
file_meta = gen_file_metadata(html_resource.body)
file_meta, html_resource = fix_transfer_encoding(file_meta, html_resource)
- if file_meta['mimetype'] not in ("text/html", "text/xml"):
+ if file_meta["mimetype"] not in ("text/html", "text/xml"):
return IngestWebResult(
status="wrong-mimetype",
hit=False,
@@ -379,8 +402,8 @@ def run_single(url: str,
html_doc = HTMLParser(html_resource.body)
html_biblio = html_extract_biblio(url, html_doc)
html_body = html_extract_body_teixml(html_resource.body)
- html_scope = html_guess_scope(url, html_doc, html_biblio, html_body.get('word_count'))
- if html_scope not in ('article-fulltext', 'unknown'):
+ html_scope = html_guess_scope(url, html_doc, html_biblio, html_body.get("word_count"))
+ if html_scope not in ("article-fulltext", "unknown"):
return IngestWebResult(
status="wrong-scope",
hit=False,
@@ -397,8 +420,9 @@ def run_single(url: str,
full_resources: List[WebResource] = []
if quick_mode:
- full_resources = quick_fetch_html_resources(raw_resources, wayback_client.cdx_client,
- when)
+ full_resources = quick_fetch_html_resources(
+ raw_resources, wayback_client.cdx_client, when
+ )
else:
full_resources = fetch_html_resources(raw_resources, wayback_client, when)
@@ -425,8 +449,9 @@ def main() -> None:
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
subparsers = parser.add_subparsers()
- sub = subparsers.add_parser("single",
- help="tries to ingest a single URL, dumps result to stdout")
+ sub = subparsers.add_parser(
+ "single", help="tries to ingest a single URL, dumps result to stdout"
+ )
sub.set_defaults(func="run_single")
sub.add_argument(
"url",
@@ -453,8 +478,8 @@ def main() -> None:
result = run_single(args.url, args.timestamp, args.quick_mode)
print(result.json(indent=2, exclude_none=True))
else:
- #func = getattr(wp, args.func)
- #func()
+ # func = getattr(wp, args.func)
+ # func()
raise NotImplementedError()
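
Programmatic equivalent of the `single` subcommand (URL invented):

    from sandcrawler.ingest_html import run_single

    result = run_single("https://example.com/article/123", timestamp=None, quick_mode=True)
    print(result.json(indent=2, exclude_none=True))
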