diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-10-30 17:33:37 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-10-30 17:33:37 -0700 |
commit | cefbc6fa46e6586d8735f40b3b5432a759edd5f1 (patch) | |
tree | 8f9d0aaa8ac4ab09a3fa7b8891bede586aa953db /python/sandcrawler/html_metadata.py | |
parent | e61d6e8cc3b6824816a83dff56ffbdbbb6329e57 (diff) | |
download | sandcrawler-cefbc6fa46e6586d8735f40b3b5432a759edd5f1.tar.gz sandcrawler-cefbc6fa46e6586d8735f40b3b5432a759edd5f1.zip |
html: syntax fixes; resolve relative URLs; extract more XML fulltext URLs
Diffstat (limited to 'python/sandcrawler/html_metadata.py')
-rw-r--r-- | python/sandcrawler/html_metadata.py | 17 |
1 file changed, 12 insertions, 5 deletions
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index d3ca1b7..41157e0 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -232,11 +232,7 @@ class BiblioMetadata(pydantic.BaseModel):
     xml_fulltext_url: Optional[str]
 
 
-def html_extract_biblio(doc: HTMLParser) -> Optional[BiblioMetadata]:
-    """
-    TODO:
-    - meta dc.identifier: parse DOI
-    """
+def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadata]:
 
     meta: Any = dict()
     head = doc.css_first("head")
@@ -262,6 +258,12 @@ def html_extract_biblio(doc: HTMLParser) -> Optional[BiblioMetadata]:
                         meta[field].append(val.attrs['content'])
                 break
 
+    # non-<meta> lookups
+    if not meta.get('xml_fulltext_url'):
+        val = head.css_first("link[rel='alternate'][type='application/xml']")
+        if val and val.attrs['href']:
+            meta['xml_fulltext_url'] = val.attrs['href']
+
     # TODO: replace with clean_doi() et al
     if meta.get('doi') and meta.get('doi').startswith('doi:'):
         meta['doi'] = meta['doi'][4:]
@@ -290,6 +292,11 @@ def html_extract_biblio(doc: HTMLParser) -> Optional[BiblioMetadata]:
     if release_type:
         meta['release_type'] = release_type
 
+    # resolve relative URLs
+    for key in ('pdf_fulltext_url', 'html_fulltext_url', 'xml_fulltext_url'):
+        if meta.get(key):
+            meta[key] = urllib.parse.urljoin(doc_url, meta[key])
+
     return BiblioMetadata(**meta)
 
 def load_adblock_rules() -> braveblock.Adblocker: