diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-11-08 18:02:51 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-11-08 18:02:51 -0800 |
commit | 1a8601bdc36640894d1c34f5c92bc2eda5771bca (patch) | |
tree | 036bfa0cab2f5b06bbe93956a7d8d2b95241d1a9 | |
parent | abe36a83d189e13f3fe20519ccc4d90114e71455 (diff) | |
download | sandcrawler-1a8601bdc36640894d1c34f5c92bc2eda5771bca.tar.gz sandcrawler-1a8601bdc36640894d1c34f5c92bc2eda5771bca.zip |
html: more extraction patterns; bugfix; skip more crossmark
-rw-r--r-- | python/sandcrawler/html_metadata.py | 25 |
1 file changed, 24 insertions, 1 deletion
```diff
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index cd49a05..eb89a01 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -184,11 +184,21 @@ XML_FULLTEXT_PATTERNS: List[dict] = [
         "technique": "citation_xml_url",
     },
     {
+        "selector": "meta[name='fulltext_xml']",
+        "attr": "content",
+        "technique": "fulltext_xml",
+    },
+    {
         "selector": "link[rel='alternate'][type='application/xml']",
         "attr": "href",
         "technique": "alternate link",
     },
     {
+        "selector": "link[rel='alternate'][type='text/xml']",
+        "attr": "href",
+        "technique": "alternate link",
+    },
+    {
         "in_doc_url": "scielo",
         "in_fulltext_url": "articleXML",
         "selector": "a[target='xml']",
@@ -211,12 +221,24 @@ HTML_FULLTEXT_PATTERNS: List[dict] = [
         "technique": "citation_fulltext_html_url",
     },
     {
+        "selector": "link[rel='alternate'][type='text/html']",
+        "attr": "href",
+        "technique": "alternate link",
+    },
+    {
         "in_doc_url": "/article/view/",
         "in_fulltext_url": "inline=1",
         "selector": "iframe[name='htmlFrame']",
         "attr": "src",
         "technique": "OJS HTML iframe",
     },
+    {
+        "in_doc_url": "dovepress.com",
+        "in_fulltext_url": "-fulltext-",
+        "selector": "a[id='view-full-text']",
+        "attr": "href",
+        "technique": "dovepress fulltext link",
+    },
 ]
 
 PDF_FULLTEXT_PATTERNS: List[dict] = [
@@ -319,7 +341,7 @@ def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadat
         for pattern in patterns:
             val = head.css_first(pattern)
             #print((field, pattern, val))
-            if val and val.attrs['content']:
+            if val and val.attrs.get('content'):
                 meta[field] = val.attrs['content']
                 break
 
@@ -389,6 +411,7 @@ def load_adblock_rules() -> braveblock.Adblocker:
             "||fonts.googleapis.com^",
             "||widgets.figshare.com^",
             "||crossmark-cdn.crossref.org^",
+            "||crossmark.crossref.org^",
             "||platform.twitter.com^",
             "||verify.nature.com^",
             "||s7.addthis.com^",
```