author    Bryan Newbold <bnewbold@archive.org>  2020-11-03 16:23:34 -0800
committer Bryan Newbold <bnewbold@archive.org>  2020-11-03 16:23:34 -0800
commit    c145488142d4b5413323322dfc1422efdece83f7 (patch)
tree      0db6e9b872bea47f15f63242e4bbf897c1356c61
parent    0b3a9118d7aa9fc3540f8d8f7c367a4c6a856ecf (diff)
html: some refactoring
-rw-r--r--  python/sandcrawler/html_ingest.py    | 29
-rw-r--r--  python/sandcrawler/html_metadata.py  | 50
-rw-r--r--  python/tests/test_html_ingest.py     |  2
3 files changed, 57 insertions(+), 24 deletions(-)
diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py
index 823218b..fe883ba 100644
--- a/python/sandcrawler/html_ingest.py
+++ b/python/sandcrawler/html_ingest.py
@@ -16,19 +16,21 @@ from sandcrawler.misc import gen_file_metadata, parse_cdx_datetime, datetime_to_
 from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules
 
 
-def html_extract_fulltext_teixml(doc: bytes) -> dict:
+TRAFILATURA_AGENT = f"trafilatura/{trafilatura.__version__}"
+
+def html_extract_body_teixml(doc: bytes) -> dict:
     tei_xml = trafilatura.extract(doc,
         tei_output=True,
         include_comments=False,
         include_formatting=True,
     )
     if tei_xml:
-        return dict(status="success", tei_xml=tei_xml)
+        return dict(status="success", agent=TRAFILATURA_AGENT, tei_xml=tei_xml)
     elif doc.startswith(b'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">'):
         # hack for firstmonday.org
-        return html_extract_fulltext_teixml(doc[106:])
+        return html_extract_body_teixml(doc[106:])
     else:
-        return dict(status="empty-xml")
+        return dict(status="empty-xml", agent=TRAFILATURA_AGENT)
 
 
 def teixml_body_text(doc_xml: str) -> str:
     ns = {"tei": "http://www.tei-c.org/ns/1.0"}
@@ -58,14 +60,15 @@ class WebResource(pydantic.BaseModel):
 class IngestWebResult(pydantic.BaseModel):
     status: str
     hit: bool
+    error_message: Optional[str]
     cdx: Optional[dict]
     terminal: Optional[Any] # TODO
     request: Optional[Any] # TODO
     file_meta: Optional[dict]
     html_biblio: Optional[BiblioMetadata]
-    html_scope: Optional[str]
-    html_fulltext: Optional[dict]
-    subresources: Optional[List[WebResource]]
+    scope: Optional[str]
+    html_body: Optional[dict]
+    html_resources: Optional[List[WebResource]]
 
     class Config:
         arbitrary_types_allowed = True
@@ -228,8 +231,8 @@ def run_single(url: str, timestamp: Optional[str] = None, quick_mode: bool = Fal
 
     html_doc = HTMLParser(html_resource.body)
     html_biblio = html_extract_biblio(url, html_doc)
-    html_fulltext = html_extract_fulltext_teixml(html_resource.body)
-    html_scope = html_guess_scope(url, html_doc, html_biblio, html_fulltext.get('tei_xml'))
+    html_body = html_extract_body_teixml(html_resource.body)
+    html_scope = html_guess_scope(url, html_doc, html_biblio, html_body.get('tei_xml'))
     if html_scope not in ('article-fulltext', 'unknown'):
         return IngestWebResult(
             status="wrong-scope",
@@ -237,7 +240,7 @@ def run_single(url: str, timestamp: Optional[str] = None, quick_mode: bool = Fal
             cdx=html_resource.cdx and cdx_to_dict(html_resource.cdx),
             file_meta=file_meta,
             html_biblio=html_biblio,
-            html_scope=html_scope,
+            scope=html_scope,
         )
 
     raw_resources = html_extract_resources(html_resource.terminal_url, html_doc, adblock)
@@ -256,10 +259,10 @@ def run_single(url: str, timestamp: Optional[str] = None, quick_mode: bool = Fal
         hit=True,
         cdx=html_resource.cdx and cdx_to_dict(html_resource.cdx),
         file_meta=file_meta,
-        html_fulltext=html_fulltext,
+        html_body=html_body,
         html_biblio=html_biblio,
-        html_scope=html_scope,
-        subresources=full_resources,
+        scope=html_scope,
+        html_resources=full_resources,
     )
 
     return output
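
Aside: the renamed helper can be exercised on its own. A minimal sketch, assuming trafilatura is installed; "article.html" is a hypothetical saved page, not part of this commit:

    from sandcrawler.html_ingest import html_extract_body_teixml

    # "article.html" is a hypothetical local HTML article file
    with open("article.html", "rb") as f:
        result = html_extract_body_teixml(f.read())

    # the success case now carries the new "agent" field alongside the TEI-XML,
    # e.g. {"status": "success", "agent": "trafilatura/...", "tei_xml": "..."}
    print(result["status"], result.get("agent"))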
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 41157e0..b23118b 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -1,6 +1,6 @@
 import datetime
-from typing import List, Optional, Any
+from typing import List, Optional, Any, Tuple, Dict
 import urllib.parse
 
 import dateparser
@@ -158,9 +158,6 @@ HEAD_META_PATTERNS: Any = {
         "meta[name='citation_fulltext_html_url']",
         "meta[name='bepress_citation_fulltext_html_url']",
     ],
-    "xml_fulltext_url": [
-        "meta[name='citation_xml_url']",
-    ],
     "pdf_fulltext_url": [
         "meta[name='citation_pdf_url']",
         "meta[name='bepress_citation_pdf_url']",
@@ -188,6 +185,19 @@ HEAD_META_LIST_PATTERNS: Any = {
     ],
 }
 
+XML_FULLTEXT_PATTERNS: List[dict] = [
+    {
+        "selector": "meta[name='citation_xml_url']",
+        "attr": "content",
+        "why": "citation_xml_url",
+    },
+    {
+        "selector": "link[rel='alternate'][type='application/xml']",
+        "attr": "href",
+        "why": "alternate link",
+    },
+]
+
 RELEASE_TYPE_MAP = {
     "research article": "article-journal",
     "text.serial.journal": "article-journal",
@@ -232,6 +242,27 @@ class BiblioMetadata(pydantic.BaseModel):
     xml_fulltext_url: Optional[str]
 
 
+def html_extract_fulltext_url(doc_url: str, doc: HTMLParser, patterns: List[dict]) -> Optional[Tuple[str, str]]:
+    """
+    Tries to quickly extract fulltext URLs using a set of patterns. This
+    function is intended to be generic across various extraction techniques.
+
+    Returns None or a tuple of (url, why)
+    """
+    for pattern in patterns:
+        if not 'selector' in pattern:
+            continue
+        elem = doc.css_first(pattern['selector'])
+        if not elem:
+            continue
+        if 'attr' in pattern:
+            val = elem.attrs[pattern['attr']]
+            if val:
+                val = urllib.parse.urljoin(doc_url, val)
+                assert val
+                return (val, pattern.get('why', 'unknown'))
+    return None
+
 def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadata]:
     meta: Any = dict()
@@ -258,11 +289,10 @@ def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadat
                     meta[field].append(val.attrs['content'])
             break
 
-    # non-<meta> lookups
-    if not meta.get('xml_fulltext_url'):
-        val = head.css_first("link[rel='alternate'][type='application/xml']")
-        if val and val.attrs['href']:
-            meta['xml_fulltext_url'] = val.attrs['href']
+    # (some) fulltext extractions
+    xml_fulltext_url = html_extract_fulltext_url(doc_url, doc, XML_FULLTEXT_PATTERNS)
+    if xml_fulltext_url:
+        meta['xml_fulltext_url'] = xml_fulltext_url[0]
 
     # TODO: replace with clean_doi() et al
     if meta.get('doi') and meta.get('doi').startswith('doi:'):
@@ -293,7 +323,7 @@ def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadat
         meta['release_type'] = release_type
 
     # resolve relative URLs
-    for key in ('pdf_fulltext_url', 'html_fulltext_url', 'xml_fulltext_url'):
+    for key in ('pdf_fulltext_url', 'html_fulltext_url'):
         if meta.get(key):
             meta[key] = urllib.parse.urljoin(doc_url, meta[key])
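
Aside: a sketch of how the new generic pattern helper resolves URLs, assuming selectolax is available; the inline HTML and URLs below are illustrative, not from this commit:

    from selectolax.parser import HTMLParser
    from sandcrawler.html_metadata import XML_FULLTEXT_PATTERNS, html_extract_fulltext_url

    # illustrative document advertising a JATS XML version via <meta>
    html = b"""<html><head>
    <meta name="citation_xml_url" content="/article/123/jats.xml">
    </head><body></body></html>"""

    hit = html_extract_fulltext_url(
        "https://journal.example.com/article/123",
        HTMLParser(html),
        XML_FULLTEXT_PATTERNS,
    )
    # relative URLs are resolved against doc_url inside the helper, and the
    # second tuple element is the matching pattern's "why" label
    assert hit == ("https://journal.example.com/article/123/jats.xml", "citation_xml_url")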
diff --git a/python/tests/test_html_ingest.py b/python/tests/test_html_ingest.py
index 96b3883..e6e48ac 100644
--- a/python/tests/test_html_ingest.py
+++ b/python/tests/test_html_ingest.py
@@ -10,5 +10,5 @@ def test_html_extract_ojs3() -> None:
     with open('tests/files/first_monday_ojs3_fulltext.html', 'rb') as f:
         ojs3_html = f.read()
 
-    fulltext = html_extract_fulltext_teixml(ojs3_html)
+    fulltext = html_extract_body_teixml(ojs3_html)
     assert fulltext['status'] == 'success'
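
The fixture test only needed the rename. To run it in isolation (the fixture path is relative, so this assumes pytest is invoked from the python/ directory):

    pytest tests/test_html_ingest.py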