From c145488142d4b5413323322dfc1422efdece83f7 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 3 Nov 2020 16:23:34 -0800 Subject: html: some refactoring --- python/sandcrawler/html_ingest.py | 29 +++++++++++---------- python/sandcrawler/html_metadata.py | 50 +++++++++++++++++++++++++++++-------- python/tests/test_html_ingest.py | 2 +- 3 files changed, 57 insertions(+), 24 deletions(-) (limited to 'python') diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py index 823218b..fe883ba 100644 --- a/python/sandcrawler/html_ingest.py +++ b/python/sandcrawler/html_ingest.py @@ -16,19 +16,21 @@ from sandcrawler.misc import gen_file_metadata, parse_cdx_datetime, datetime_to_ from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules -def html_extract_fulltext_teixml(doc: bytes) -> dict: +TRAFILATURA_AGENT = f"trafilatura/{trafilatura.__version__}" + +def html_extract_body_teixml(doc: bytes) -> dict: tei_xml = trafilatura.extract(doc, tei_output=True, include_comments=False, include_formatting=True, ) if tei_xml: - return dict(status="success", tei_xml=tei_xml) + return dict(status="success", agent=TRAFILATURA_AGENT, tei_xml=tei_xml) elif doc.startswith(b''): # hack for firstmonday.org - return html_extract_fulltext_teixml(doc[106:]) + return html_extract_body_teixml(doc[106:]) else: - return dict(status="empty-xml") + return dict(status="empty-xml", agent=TRAFILATURA_AGENT) def teixml_body_text(doc_xml: str) -> str: ns = {"tei": "http://www.tei-c.org/ns/1.0"} @@ -58,14 +60,15 @@ class WebResource(pydantic.BaseModel): class IngestWebResult(pydantic.BaseModel): status: str hit: bool + error_message: Optional[str] cdx: Optional[dict] terminal: Optional[Any] # TODO request: Optional[Any] # TODO file_meta: Optional[dict] html_biblio: Optional[BiblioMetadata] - html_scope: Optional[str] - html_fulltext: Optional[dict] - subresources: Optional[List[WebResource]] + scope: Optional[str] + html_body: Optional[dict] + html_resources: Optional[List[WebResource]] class Config: arbitrary_types_allowed = True @@ -228,8 +231,8 @@ def run_single(url: str, timestamp: Optional[str] = None, quick_mode: bool = Fal html_doc = HTMLParser(html_resource.body) html_biblio = html_extract_biblio(url, html_doc) - html_fulltext = html_extract_fulltext_teixml(html_resource.body) - html_scope = html_guess_scope(url, html_doc, html_biblio, html_fulltext.get('tei_xml')) + html_body = html_extract_body_teixml(html_resource.body) + html_scope = html_guess_scope(url, html_doc, html_biblio, html_body.get('tei_xml')) if html_scope not in ('article-fulltext', 'unknown'): return IngestWebResult( status="wrong-scope", @@ -237,7 +240,7 @@ def run_single(url: str, timestamp: Optional[str] = None, quick_mode: bool = Fal cdx=html_resource.cdx and cdx_to_dict(html_resource.cdx), file_meta=file_meta, html_biblio=html_biblio, - html_scope=html_scope, + scope=html_scope, ) raw_resources = html_extract_resources(html_resource.terminal_url, html_doc, adblock) @@ -256,10 +259,10 @@ def run_single(url: str, timestamp: Optional[str] = None, quick_mode: bool = Fal hit=True, cdx=html_resource.cdx and cdx_to_dict(html_resource.cdx), file_meta=file_meta, - html_fulltext=html_fulltext, + html_body=html_body, html_biblio=html_biblio, - html_scope=html_scope, - subresources=full_resources, + scope=html_scope, + html_resources=full_resources, ) return output diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py index 41157e0..b23118b 100644 --- a/python/sandcrawler/html_metadata.py +++ b/python/sandcrawler/html_metadata.py @@ -1,6 +1,6 @@ import datetime -from typing import List, Optional, Any +from typing import List, Optional, Any, Tuple, Dict import urllib.parse import dateparser @@ -158,9 +158,6 @@ HEAD_META_PATTERNS: Any = { "meta[name='citation_fulltext_html_url']", "meta[name='bepress_citation_fulltext_html_url']", ], - "xml_fulltext_url": [ - "meta[name='citation_xml_url']", - ], "pdf_fulltext_url": [ "meta[name='citation_pdf_url']", "meta[name='bepress_citation_pdf_url']", @@ -188,6 +185,19 @@ HEAD_META_LIST_PATTERNS: Any = { ], } +XML_FULLTEXT_PATTERNS: List[dict] = [ + { + "selector": "meta[name='citation_xml_url']", + "attr": "content", + "why": "citation_xml_url", + }, + { + "selector": "link[rel='alternate'][type='application/xml']", + "attr": "href", + "why": "alternate link", + }, +] + RELEASE_TYPE_MAP = { "research article": "article-journal", "text.serial.journal": "article-journal", @@ -232,6 +242,27 @@ class BiblioMetadata(pydantic.BaseModel): xml_fulltext_url: Optional[str] +def html_extract_fulltext_url(doc_url: str, doc: HTMLParser, patterns: List[dict]) -> Optional[Tuple[str, str]]: + """ + Tries to quickly extract fulltext URLs using a set of patterns. This + function is intendend to be generic across various extraction techniques. + + Returns null or a tuple of (url, why) + """ + for pattern in patterns: + if not 'selector' in pattern: + continue + elem = doc.css_first(pattern['selector']) + if not elem: + continue + if 'attr' in pattern: + val = elem.attrs[pattern['attr']] + if val: + val = urllib.parse.urljoin(doc_url, val) + assert val + return (val, pattern.get('why', 'unknown')) + return None + def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadata]: meta: Any = dict() @@ -258,11 +289,10 @@ def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadat meta[field].append(val.attrs['content']) break - # non- lookups - if not meta.get('xml_fulltext_url'): - val = head.css_first("link[rel='alternate'][type='application/xml']") - if val and val.attrs['href']: - meta['xml_fulltext_url'] = val.attrs['href'] + # (some) fulltext extractions + xml_fulltext_url = html_extract_fulltext_url(doc_url, doc, XML_FULLTEXT_PATTERNS) + if xml_fulltext_url: + meta['xml_fulltext_url'] = xml_fulltext_url[0] # TODO: replace with clean_doi() et al if meta.get('doi') and meta.get('doi').startswith('doi:'): @@ -293,7 +323,7 @@ def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadat meta['release_type'] = release_type # resolve relative URLs - for key in ('pdf_fulltext_url', 'html_fulltext_url', 'xml_fulltext_url'): + for key in ('pdf_fulltext_url', 'html_fulltext_url'): if meta.get(key): meta[key] = urllib.parse.urljoin(doc_url, meta[key]) diff --git a/python/tests/test_html_ingest.py b/python/tests/test_html_ingest.py index 96b3883..e6e48ac 100644 --- a/python/tests/test_html_ingest.py +++ b/python/tests/test_html_ingest.py @@ -10,5 +10,5 @@ def test_html_extract_ojs3() -> None: with open('tests/files/first_monday_ojs3_fulltext.html', 'rb') as f: ojs3_html = f.read() - fulltext = html_extract_fulltext_teixml(ojs3_html) + fulltext = html_extract_body_teixml(ojs3_html) assert fulltext['status'] == 'success' -- cgit v1.2.3