diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-11-03 17:16:16 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-11-03 17:16:16 -0800 |
commit | 84a44a8d98c2d9872af0962021b19def7173e8ba (patch) | |
tree | e3d2f7bcd2db0707548602d45d52d7a16a927d5b | |
parent | 644c6abdb424a3759e06df6b2541d41fb353e95c (diff) | |
download | sandcrawler-84a44a8d98c2d9872af0962021b19def7173e8ba.tar.gz sandcrawler-84a44a8d98c2d9872af0962021b19def7173e8ba.zip |
html: improve XML fulltext extraction for scielo
-rw-r--r-- | python/sandcrawler/html_metadata.py | 21 |
1 file changed, 17 insertions, 4 deletions
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index b23118b..3ebba57 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -189,12 +189,19 @@ XML_FULLTEXT_PATTERNS: List[dict] = [
     {
         "selector": "meta[name='citation_xml_url']",
         "attr": "content",
-        "why": "citation_xml_url",
+        "technique": "citation_xml_url",
     },
     {
         "selector": "link[rel='alternate'][type='application/xml']",
         "attr": "href",
-        "why": "alternate link",
+        "technique": "alternate link",
+    },
+    {
+        "in_doc_url": "scielo",
+        "in_fulltext_url": "articleXML",
+        "selector": "a[target='xml']",
+        "attr": "href",
+        "technique": "SciElo XML link",
     },
 ]
 
@@ -247,11 +254,14 @@ def html_extract_fulltext_url(doc_url: str, doc: HTMLParser, patterns: List[dict
     Tries to quickly extract fulltext URLs using a set of patterns. This
     function is intendend to be generic across various extraction techniques.
 
-    Returns null or a tuple of (url, why)
+    Returns null or a tuple of (url, technique)
     """
     for pattern in patterns:
         if not 'selector' in pattern:
             continue
+        if 'in_doc_url' in pattern:
+            if not pattern['in_doc_url'] in doc_url:
+                continue
         elem = doc.css_first(pattern['selector'])
         if not elem:
             continue
@@ -260,7 +270,10 @@ def html_extract_fulltext_url(doc_url: str, doc: HTMLParser, patterns: List[dict
             if val:
                 val = urllib.parse.urljoin(doc_url, val)
                 assert val
-                return (val, pattern.get('why', 'unknown'))
+                if 'in_fulltext_url' in pattern:
+                    if not pattern['in_fulltext_url'] in val:
+                        continue
+                return (val, pattern.get('technique', 'unknown'))
     return None
 
 def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadata]: