about | summary | refs | log | tree | commit | diff | stats
path: root/python
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-11-03 17:16:16 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-11-03 17:16:16 -0800
commit: 84a44a8d98c2d9872af0962021b19def7173e8ba (patch)
tree: e3d2f7bcd2db0707548602d45d52d7a16a927d5b /python
parent: 644c6abdb424a3759e06df6b2541d41fb353e95c (diff)
download: sandcrawler-84a44a8d98c2d9872af0962021b19def7173e8ba.tar.gz
download: sandcrawler-84a44a8d98c2d9872af0962021b19def7173e8ba.zip
html: improve XML fulltext extraction for scielo
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/html_metadata.py | 21
1 files changed, 17 insertions, 4 deletions
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index b23118b..3ebba57 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -189,12 +189,19 @@ XML_FULLTEXT_PATTERNS: List[dict] = [
{
"selector": "meta[name='citation_xml_url']",
"attr": "content",
- "why": "citation_xml_url",
+ "technique": "citation_xml_url",
},
{
"selector": "link[rel='alternate'][type='application/xml']",
"attr": "href",
- "why": "alternate link",
+ "technique": "alternate link",
+ },
+ {
+ "in_doc_url": "scielo",
+ "in_fulltext_url": "articleXML",
+ "selector": "a[target='xml']",
+ "attr": "href",
+ "technique": "SciElo XML link",
},
]
@@ -247,11 +254,14 @@ def html_extract_fulltext_url(doc_url: str, doc: HTMLParser, patterns: List[dict
Tries to quickly extract fulltext URLs using a set of patterns. This
function is intendend to be generic across various extraction techniques.
- Returns null or a tuple of (url, why)
+ Returns null or a tuple of (url, technique)
"""
for pattern in patterns:
if not 'selector' in pattern:
continue
+ if 'in_doc_url' in pattern:
+ if not pattern['in_doc_url'] in doc_url:
+ continue
elem = doc.css_first(pattern['selector'])
if not elem:
continue
@@ -260,7 +270,10 @@ def html_extract_fulltext_url(doc_url: str, doc: HTMLParser, patterns: List[dict
if val:
val = urllib.parse.urljoin(doc_url, val)
assert val
- return (val, pattern.get('why', 'unknown'))
+ if 'in_fulltext_url' in pattern:
+ if not pattern['in_fulltext_url'] in val:
+ continue
+ return (val, pattern.get('technique', 'unknown'))
return None
def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadata]: