html: syntax fixes; resolve relative URLs; extract more XML fulltext URLs

author: Bryan Newbold <bnewbold@archive.org> 2020-10-30 17:33:37 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-10-30 17:33:37 -0700
commit: cefbc6fa46e6586d8735f40b3b5432a759edd5f1 (patch)
tree: 8f9d0aaa8ac4ab09a3fa7b8891bede586aa953db /python/sandcrawler/html_ingest.py
parent: e61d6e8cc3b6824816a83dff56ffbdbbb6329e57 (diff)
download: sandcrawler-cefbc6fa46e6586d8735f40b3b5432a759edd5f1.tar.gz
sandcrawler-cefbc6fa46e6586d8735f40b3b5432a759edd5f1.zip
1 file changed, 3 insertions, 3 deletions
diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py
index acd336e..284461e 100644
--- a/python/sandcrawler/html_ingest.py
+++ b/python/sandcrawler/html_ingest.py
@@ -6,7 +6,7 @@ import json
 import datetime
 import argparse
 import xml.etree.ElementTree as ET
-from typing import List, Optional, Any
+from typing import List, Optional, Any, Tuple
 
 import trafilatura
 import pydantic
@@ -75,7 +75,7 @@ class IngestWebResult(pydantic.BaseModel):
         }
 
 
-def fix_transfer_encoding(file_meta: dict, resource: ResourceResult) -> (dict, ResourceResult):
+def fix_transfer_encoding(file_meta: dict, resource: ResourceResult) -> Tuple[dict, ResourceResult]:
     if file_meta['mimetype'] == 'application/gzip' and resource.cdx and resource.cdx.mimetype != 'application/gzip':
         print("transfer encoding not stripped: {}".format(resource.cdx.mimetype), file=sys.stderr)
         inner_body = gzip.decompress(resource.body)
@@ -233,7 +233,7 @@ def run_single(url: str, timestamp: Optional[str] = None, quick_mode: bool = Fal
         )
 
     html_doc = HTMLParser(html_resource.body)
-    html_biblio = html_extract_biblio(html_doc)
+    html_biblio = html_extract_biblio(url, html_doc)
     html_fulltext = html_extract_fulltext_teixml(html_resource.body)
     html_scope = html_guess_scope(url, html_doc, html_biblio, html_fulltext.get('tei_xml'))
     if html_scope not in ('article-fulltext', 'unknown'):
author	Bryan Newbold <bnewbold@archive.org>	2020-10-30 17:33:37 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2020-10-30 17:33:37 -0700
commit	cefbc6fa46e6586d8735f40b3b5432a759edd5f1 (patch)
tree	8f9d0aaa8ac4ab09a3fa7b8891bede586aa953db /python/sandcrawler/html_ingest.py
parent	e61d6e8cc3b6824816a83dff56ffbdbbb6329e57 (diff)
download	sandcrawler-cefbc6fa46e6586d8735f40b3b5432a759edd5f1.tar.gz sandcrawler-cefbc6fa46e6586d8735f40b3b5432a759edd5f1.zip