path: root/python/sandcrawler
author    Bryan Newbold <bnewbold@archive.org>    2020-10-30 17:33:37 -0700
committer Bryan Newbold <bnewbold@archive.org>    2020-10-30 17:33:37 -0700
commit    cefbc6fa46e6586d8735f40b3b5432a759edd5f1 (patch)
tree      8f9d0aaa8ac4ab09a3fa7b8891bede586aa953db /python/sandcrawler
parent    e61d6e8cc3b6824816a83dff56ffbdbbb6329e57 (diff)
download  sandcrawler-cefbc6fa46e6586d8735f40b3b5432a759edd5f1.tar.gz
          sandcrawler-cefbc6fa46e6586d8735f40b3b5432a759edd5f1.zip
html: syntax fixes; resolve relative URLs; extract more XML fulltext URLs
Diffstat (limited to 'python/sandcrawler')
-rw-r--r--  python/sandcrawler/html_ingest.py    6
-rw-r--r--  python/sandcrawler/html_metadata.py  17
2 files changed, 15 insertions, 8 deletions
diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py
index acd336e..284461e 100644
--- a/python/sandcrawler/html_ingest.py
+++ b/python/sandcrawler/html_ingest.py
@@ -6,7 +6,7 @@ import json
import datetime
import argparse
import xml.etree.ElementTree as ET
-from typing import List, Optional, Any
+from typing import List, Optional, Any, Tuple
import trafilatura
import pydantic
@@ -75,7 +75,7 @@ class IngestWebResult(pydantic.BaseModel):
}
-def fix_transfer_encoding(file_meta: dict, resource: ResourceResult) -> (dict, ResourceResult):
+def fix_transfer_encoding(file_meta: dict, resource: ResourceResult) -> Tuple[dict, ResourceResult]:
if file_meta['mimetype'] == 'application/gzip' and resource.cdx and resource.cdx.mimetype != 'application/gzip':
print("transfer encoding not stripped: {}".format(resource.cdx.mimetype), file=sys.stderr)
inner_body = gzip.decompress(resource.body)
@@ -233,7 +233,7 @@ def run_single(url: str, timestamp: Optional[str] = None, quick_mode: bool = Fal
)
html_doc = HTMLParser(html_resource.body)
- html_biblio = html_extract_biblio(html_doc)
+ html_biblio = html_extract_biblio(url, html_doc)
html_fulltext = html_extract_fulltext_teixml(html_resource.body)
html_scope = html_guess_scope(url, html_doc, html_biblio, html_fulltext.get('tei_xml'))
if html_scope not in ('article-fulltext', 'unknown'):
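As context for the first hunk: the corrected Tuple[dict, ResourceResult] annotation describes a helper whose body (shown above) strips a gzip transfer encoding that was not handled during fetch. A minimal standalone sketch of that check, with simplified, hypothetical names rather than the repository's actual signature:

    import gzip

    def strip_gzip_transfer_encoding(body: bytes, sniffed_mimetype: str, cdx_mimetype: str) -> bytes:
        # If the body was sniffed as gzip but the CDX record claims a different
        # mimetype, the transfer encoding was never stripped during fetch, so
        # decompress the payload before any further parsing.
        if sniffed_mimetype == 'application/gzip' and cdx_mimetype != 'application/gzip':
            return gzip.decompress(body)
        return body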
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index d3ca1b7..41157e0 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -232,11 +232,7 @@ class BiblioMetadata(pydantic.BaseModel):
xml_fulltext_url: Optional[str]
-def html_extract_biblio(doc: HTMLParser) -> Optional[BiblioMetadata]:
- """
- TODO:
- - meta dc.identifier: parse DOI
- """
+def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadata]:
meta: Any = dict()
head = doc.css_first("head")
@@ -262,6 +258,12 @@ def html_extract_biblio(doc: HTMLParser) -> Optional[BiblioMetadata]:
meta[field].append(val.attrs['content'])
break
+ # non-<meta> lookups
+ if not meta.get('xml_fulltext_url'):
+ val = head.css_first("link[rel='alternate'][type='application/xml']")
+ if val and val.attrs['href']:
+ meta['xml_fulltext_url'] = val.attrs['href']
+
# TODO: replace with clean_doi() et al
if meta.get('doi') and meta.get('doi').startswith('doi:'):
meta['doi'] = meta['doi'][4:]
@@ -290,6 +292,11 @@ def html_extract_biblio(doc: HTMLParser) -> Optional[BiblioMetadata]:
if release_type:
meta['release_type'] = release_type
+ # resolve relative URLs
+ for key in ('pdf_fulltext_url', 'html_fulltext_url', 'xml_fulltext_url'):
+ if meta.get(key):
+ meta[key] = urllib.parse.urljoin(doc_url, meta[key])
+
return BiblioMetadata(**meta)
def load_adblock_rules() -> braveblock.Adblocker:
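Taken together, the html_metadata.py hunks add a non-&lt;meta&gt; fallback for XML fulltext links and resolve relative fulltext URLs against the page URL. A self-contained sketch of that combined behavior, assuming the HTMLParser in use is selectolax's (as elsewhere in sandcrawler); the standalone helper name is illustrative, not the module's actual API:

    import urllib.parse
    from typing import Optional

    from selectolax.parser import HTMLParser

    def extract_xml_fulltext_url(doc_url: str, html_body: bytes) -> Optional[str]:
        doc = HTMLParser(html_body)
        head = doc.css_first("head")
        if not head:
            return None
        # non-<meta> lookup: an alternate <link> pointing at an XML representation
        link = head.css_first("link[rel='alternate'][type='application/xml']")
        if not link:
            return None
        href = link.attributes.get("href")
        if not href:
            return None
        # resolve a possibly-relative href against the document URL, as the new
        # urljoin() step in html_extract_biblio does for all fulltext URL fields
        return urllib.parse.urljoin(doc_url, href)

For example, a relative href of "fulltext.xml" on a page at https://example.com/article/123/view resolves to https://example.com/article/123/fulltext.xml, which is why html_extract_biblio now takes the document URL as its first argument.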