about summary refs log tree commit diff stats
path: root/python/sandcrawler/html_metadata.py
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-11-03 16:23:34 -0800
committerBryan Newbold <bnewbold@archive.org>2020-11-03 16:23:34 -0800
commitc145488142d4b5413323322dfc1422efdece83f7 (patch)
tree0db6e9b872bea47f15f63242e4bbf897c1356c61 /python/sandcrawler/html_metadata.py
parent0b3a9118d7aa9fc3540f8d8f7c367a4c6a856ecf (diff)
downloadsandcrawler-c145488142d4b5413323322dfc1422efdece83f7.tar.gz
sandcrawler-c145488142d4b5413323322dfc1422efdece83f7.zip
html: some refactoring
Diffstat (limited to 'python/sandcrawler/html_metadata.py')
-rw-r--r--python/sandcrawler/html_metadata.py | 50
1 file changed, 40 insertions, 10 deletions
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 41157e0..b23118b 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -1,6 +1,6 @@
import datetime
-from typing import List, Optional, Any
+from typing import List, Optional, Any, Tuple, Dict
import urllib.parse
import dateparser
@@ -158,9 +158,6 @@ HEAD_META_PATTERNS: Any = {
"meta[name='citation_fulltext_html_url']",
"meta[name='bepress_citation_fulltext_html_url']",
],
- "xml_fulltext_url": [
- "meta[name='citation_xml_url']",
- ],
"pdf_fulltext_url": [
"meta[name='citation_pdf_url']",
"meta[name='bepress_citation_pdf_url']",
@@ -188,6 +185,19 @@ HEAD_META_LIST_PATTERNS: Any = {
],
}
+XML_FULLTEXT_PATTERNS: List[dict] = [
+ {
+ "selector": "meta[name='citation_xml_url']",
+ "attr": "content",
+ "why": "citation_xml_url",
+ },
+ {
+ "selector": "link[rel='alternate'][type='application/xml']",
+ "attr": "href",
+ "why": "alternate link",
+ },
+]
+
RELEASE_TYPE_MAP = {
"research article": "article-journal",
"text.serial.journal": "article-journal",
@@ -232,6 +242,27 @@ class BiblioMetadata(pydantic.BaseModel):
xml_fulltext_url: Optional[str]
+def html_extract_fulltext_url(doc_url: str, doc: HTMLParser, patterns: List[dict]) -> Optional[Tuple[str, str]]:
+ """
+ Tries to quickly extract fulltext URLs using a set of patterns. This
+    function is intended to be generic across various extraction techniques.
+
+    Returns None or a tuple of (url, why)
+ """
+ for pattern in patterns:
+ if not 'selector' in pattern:
+ continue
+ elem = doc.css_first(pattern['selector'])
+ if not elem:
+ continue
+ if 'attr' in pattern:
+ val = elem.attrs[pattern['attr']]
+ if val:
+ val = urllib.parse.urljoin(doc_url, val)
+ assert val
+ return (val, pattern.get('why', 'unknown'))
+ return None
+
def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadata]:
meta: Any = dict()
@@ -258,11 +289,10 @@ def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadat
meta[field].append(val.attrs['content'])
break
- # non-<meta> lookups
- if not meta.get('xml_fulltext_url'):
- val = head.css_first("link[rel='alternate'][type='application/xml']")
- if val and val.attrs['href']:
- meta['xml_fulltext_url'] = val.attrs['href']
+ # (some) fulltext extractions
+ xml_fulltext_url = html_extract_fulltext_url(doc_url, doc, XML_FULLTEXT_PATTERNS)
+ if xml_fulltext_url:
+ meta['xml_fulltext_url'] = xml_fulltext_url[0]
# TODO: replace with clean_doi() et al
if meta.get('doi') and meta.get('doi').startswith('doi:'):
@@ -293,7 +323,7 @@ def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadat
meta['release_type'] = release_type
# resolve relative URLs
- for key in ('pdf_fulltext_url', 'html_fulltext_url', 'xml_fulltext_url'):
+ for key in ('pdf_fulltext_url', 'html_fulltext_url'):
if meta.get(key):
meta[key] = urllib.parse.urljoin(doc_url, meta[key])