html: syntax fixes; resolve relative URLs; extract more XML fulltext URLs

author: Bryan Newbold <bnewbold@archive.org> 2020-10-30 17:33:37 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-10-30 17:33:37 -0700
commit: cefbc6fa46e6586d8735f40b3b5432a759edd5f1 (patch)
tree: 8f9d0aaa8ac4ab09a3fa7b8891bede586aa953db
parent: e61d6e8cc3b6824816a83dff56ffbdbbb6329e57 (diff)
download: sandcrawler-cefbc6fa46e6586d8735f40b3b5432a759edd5f1.tar.gz sandcrawler-cefbc6fa46e6586d8735f40b3b5432a759edd5f1.zip
3 files changed, 23 insertions, 15 deletions
diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py
index acd336e..284461e 100644
--- a/python/sandcrawler/html_ingest.py
+++ b/python/sandcrawler/html_ingest.py
@@ -6,7 +6,7 @@ import json
 import datetime
 import argparse
 import xml.etree.ElementTree as ET
-from typing import List, Optional, Any
+from typing import List, Optional, Any, Tuple
 
 import trafilatura
 import pydantic
@@ -75,7 +75,7 @@ class IngestWebResult(pydantic.BaseModel):
         }
 
 
-def fix_transfer_encoding(file_meta: dict, resource: ResourceResult) -> (dict, ResourceResult):
+def fix_transfer_encoding(file_meta: dict, resource: ResourceResult) -> Tuple[dict, ResourceResult]:
     if file_meta['mimetype'] == 'application/gzip' and resource.cdx and resource.cdx.mimetype != 'application/gzip':
         print("transfer encoding not stripped: {}".format(resource.cdx.mimetype), file=sys.stderr)
         inner_body = gzip.decompress(resource.body)
@@ -233,7 +233,7 @@ def run_single(url: str, timestamp: Optional[str] = None, quick_mode: bool = Fal
         )
 
     html_doc = HTMLParser(html_resource.body)
-    html_biblio = html_extract_biblio(html_doc)
+    html_biblio = html_extract_biblio(url, html_doc)
     html_fulltext = html_extract_fulltext_teixml(html_resource.body)
     html_scope = html_guess_scope(url, html_doc, html_biblio, html_fulltext.get('tei_xml'))
     if html_scope not in ('article-fulltext', 'unknown'):
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index d3ca1b7..41157e0 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -232,11 +232,7 @@ class BiblioMetadata(pydantic.BaseModel):
     xml_fulltext_url: Optional[str]
 
 
-def html_extract_biblio(doc: HTMLParser) -> Optional[BiblioMetadata]:
-    """
-    TODO:
-    - meta dc.identifier: parse DOI
-    """
+def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadata]:
 
     meta: Any = dict()
     head = doc.css_first("head")
@@ -262,6 +258,12 @@ def html_extract_biblio(doc: HTMLParser) -> Optional[BiblioMetadata]:
                         meta[field].append(val.attrs['content'])
                 break
 
+    # non-<meta> lookups
+    if not meta.get('xml_fulltext_url'):
+        val = head.css_first("link[rel='alternate'][type='application/xml']")
+        if val and val.attrs['href']:
+            meta['xml_fulltext_url'] = val.attrs['href']
+
     # TODO: replace with clean_doi() et al
     if meta.get('doi') and meta.get('doi').startswith('doi:'):
         meta['doi'] = meta['doi'][4:]
@@ -290,6 +292,11 @@ def html_extract_biblio(doc: HTMLParser) -> Optional[BiblioMetadata]:
         if release_type:
             meta['release_type'] = release_type
 
+    # resolve relative URLs
+    for key in ('pdf_fulltext_url', 'html_fulltext_url', 'xml_fulltext_url'):
+        if meta.get(key):
+            meta[key] = urllib.parse.urljoin(doc_url, meta[key])
+
     return BiblioMetadata(**meta)
 
 def load_adblock_rules() -> braveblock.Adblocker:
diff --git a/python/tests/test_html_metadata.py b/python/tests/test_html_metadata.py
index 597520c..b428b0d 100644
--- a/python/tests/test_html_metadata.py
+++ b/python/tests/test_html_metadata.py
@@ -10,7 +10,7 @@ def test_html_metadata_plos() -> None:
     with open('tests/files/plos_one_article.html', 'r') as f:
         plos_html = f.read()
 
-    meta = html_extract_biblio(HTMLParser(plos_html))
+    meta = html_extract_biblio("http://example.org", HTMLParser(plos_html))
     assert meta is not None
     assert meta.title == "Assessment on reticuloendotheliosis virus infection in specific-pathogen-free chickens based on detection of yolk antibody"
     assert meta.doi == "10.1371/journal.pone.0213978"
@@ -46,7 +46,7 @@ def test_html_metadata_elife() -> None:
     with open('tests/files/elife_article.html', 'r') as f:
         elife_html = f.read()
 
-    meta = html_extract_biblio(HTMLParser(elife_html))
+    meta = html_extract_biblio("http://example.org", HTMLParser(elife_html))
     assert meta is not None
     assert meta.title == "Parallel visual circuitry in a basal chordate"
     assert meta.doi == "10.7554/eLife.44753"
@@ -70,7 +70,7 @@ def test_html_metadata_peerj() -> None:
     with open('tests/files/peerj_oa_article.html', 'r') as f:
         peerj_html = f.read()
 
-    meta = html_extract_biblio(HTMLParser(peerj_html))
+    meta = html_extract_biblio("http://example.org", HTMLParser(peerj_html))
     assert meta is not None
     assert meta.title == "The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles"
     assert meta.doi == "10.7717/peerj.4375"
@@ -88,6 +88,7 @@ def test_html_metadata_peerj() -> None:
     assert meta.container_name == "PeerJ"
     # "2018-02-13"
     assert meta.release_date == datetime.date(year=2018, month=2, day=13)
+    assert meta.xml_fulltext_url and ".xml" in meta.xml_fulltext_url
 
 
 def test_html_metadata_nature() -> None:
@@ -95,7 +96,7 @@ def test_html_metadata_nature() -> None:
     with open('tests/files/nature_article.html', 'r') as f:
         nature_html = f.read()
 
-    meta = html_extract_biblio(HTMLParser(nature_html))
+    meta = html_extract_biblio("http://example.org", HTMLParser(nature_html))
     assert meta is not None
     assert meta.title == "More than 100 scientific journals have disappeared from the Internet"
     assert meta.doi == "10.1038/d41586-020-02610-z"
@@ -115,7 +116,7 @@ def test_html_metadata_ojs3() -> None:
     with open('tests/files/first_monday_ojs3_landingpage.html', 'r') as f:
         ojs3_html = f.read()
 
-    meta = html_extract_biblio(HTMLParser(ojs3_html))
+    meta = html_extract_biblio("http://example.org", HTMLParser(ojs3_html))
     assert meta is not None
     assert meta.title == "Surveillance, stigma & sociotechnical design for HIV"
     assert meta.doi == "10.5210/fm.v25i10.10274"
@@ -140,7 +141,7 @@ def test_html_metadata_dlib() -> None:
     with open('tests/files/dlib_05vanhyning.html', 'r') as f:
         dlib_html = f.read()
 
-    meta = html_extract_biblio(HTMLParser(dlib_html))
+    meta = html_extract_biblio("http://example.org", HTMLParser(dlib_html))
     assert meta is not None
     assert meta.doi == "10.1045/may2017-vanhyning"
     # "2017-05-15"
@@ -159,7 +160,7 @@ def test_html_metadata_dc_case() -> None:
     <body>Hi.</body>
     </html>"""
 
-    meta = html_extract_biblio(HTMLParser(snippet))
+    meta = html_extract_biblio("http://example.org", HTMLParser(snippet))
     assert meta is not None
     assert meta.issue == "123"
author	Bryan Newbold <bnewbold@archive.org>	2020-10-30 17:33:37 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2020-10-30 17:33:37 -0700
commit	cefbc6fa46e6586d8735f40b3b5432a759edd5f1 (patch)
tree	8f9d0aaa8ac4ab09a3fa7b8891bede586aa953db
parent	e61d6e8cc3b6824816a83dff56ffbdbbb6329e57 (diff)
download	sandcrawler-cefbc6fa46e6586d8735f40b3b5432a759edd5f1.tar.gz sandcrawler-cefbc6fa46e6586d8735f40b3b5432a759edd5f1.zip