From 1a8601bdc36640894d1c34f5c92bc2eda5771bca Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Sun, 8 Nov 2020 18:02:51 -0800 Subject: html: more extraction patterns; bugfix; skip more crossmark --- python/sandcrawler/html_metadata.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py index cd49a05..eb89a01 100644 --- a/python/sandcrawler/html_metadata.py +++ b/python/sandcrawler/html_metadata.py @@ -183,11 +183,21 @@ XML_FULLTEXT_PATTERNS: List[dict] = [ "attr": "content", "technique": "citation_xml_url", }, + { + "selector": "meta[name='fulltext_xml']", + "attr": "content", + "technique": "fulltext_xml", + }, { "selector": "link[rel='alternate'][type='application/xml']", "attr": "href", "technique": "alternate link", }, + { + "selector": "link[rel='alternate'][type='text/xml']", + "attr": "href", + "technique": "alternate link", + }, { "in_doc_url": "scielo", "in_fulltext_url": "articleXML", @@ -210,6 +220,11 @@ HTML_FULLTEXT_PATTERNS: List[dict] = [ "attr": "content", "technique": "citation_fulltext_html_url", }, + { + "selector": "link[rel='alternate'][type='text/html']", + "attr": "href", + "technique": "alternate link", + }, { "in_doc_url": "/article/view/", "in_fulltext_url": "inline=1", @@ -217,6 +232,13 @@ HTML_FULLTEXT_PATTERNS: List[dict] = [ "attr": "src", "technique": "OJS HTML iframe", }, + { + "in_doc_url": "dovepress.com", + "in_fulltext_url": "-fulltext-", + "selector": "a[id='view-full-text']", + "attr": "href", + "technique": "dovepress fulltext link", + }, ] PDF_FULLTEXT_PATTERNS: List[dict] = [ @@ -319,7 +341,7 @@ def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadat for pattern in patterns: val = head.css_first(pattern) #print((field, pattern, val)) - if val and val.attrs['content']: + if val and val.attrs.get('content'): meta[field] = val.attrs['content'] break @@ -389,6 +411,7 @@ def load_adblock_rules() -> braveblock.Adblocker: "||fonts.googleapis.com^", "||widgets.figshare.com^", "||crossmark-cdn.crossref.org^", + "||crossmark.crossref.org^", "||platform.twitter.com^", "||verify.nature.com^", "||s7.addthis.com^", -- cgit v1.2.3