about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-11-08 18:02:51 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-11-08 18:02:51 -0800
commit: 1a8601bdc36640894d1c34f5c92bc2eda5771bca (patch)
tree: 036bfa0cab2f5b06bbe93956a7d8d2b95241d1a9
parent: abe36a83d189e13f3fe20519ccc4d90114e71455 (diff)
download: sandcrawler-1a8601bdc36640894d1c34f5c92bc2eda5771bca.tar.gz
sandcrawler-1a8601bdc36640894d1c34f5c92bc2eda5771bca.zip
html: more extraction patterns; bugfix; skip more crossmark
-rw-r--r-- python/sandcrawler/html_metadata.py 25
1 files changed, 24 insertions, 1 deletions
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index cd49a05..eb89a01 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -184,11 +184,21 @@ XML_FULLTEXT_PATTERNS: List[dict] = [
"technique": "citation_xml_url",
},
{
+ "selector": "meta[name='fulltext_xml']",
+ "attr": "content",
+ "technique": "fulltext_xml",
+ },
+ {
"selector": "link[rel='alternate'][type='application/xml']",
"attr": "href",
"technique": "alternate link",
},
{
+ "selector": "link[rel='alternate'][type='text/xml']",
+ "attr": "href",
+ "technique": "alternate link",
+ },
+ {
"in_doc_url": "scielo",
"in_fulltext_url": "articleXML",
"selector": "a[target='xml']",
@@ -211,12 +221,24 @@ HTML_FULLTEXT_PATTERNS: List[dict] = [
"technique": "citation_fulltext_html_url",
},
{
+ "selector": "link[rel='alternate'][type='text/html']",
+ "attr": "href",
+ "technique": "alternate link",
+ },
+ {
"in_doc_url": "/article/view/",
"in_fulltext_url": "inline=1",
"selector": "iframe[name='htmlFrame']",
"attr": "src",
"technique": "OJS HTML iframe",
},
+ {
+ "in_doc_url": "dovepress.com",
+ "in_fulltext_url": "-fulltext-",
+ "selector": "a[id='view-full-text']",
+ "attr": "href",
+ "technique": "dovepress fulltext link",
+ },
]
PDF_FULLTEXT_PATTERNS: List[dict] = [
@@ -319,7 +341,7 @@ def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadat
for pattern in patterns:
val = head.css_first(pattern)
#print((field, pattern, val))
- if val and val.attrs['content']:
+ if val and val.attrs.get('content'):
meta[field] = val.attrs['content']
break
@@ -389,6 +411,7 @@ def load_adblock_rules() -> braveblock.Adblocker:
"||fonts.googleapis.com^",
"||widgets.figshare.com^",
"||crossmark-cdn.crossref.org^",
+ "||crossmark.crossref.org^",
"||platform.twitter.com^",
"||verify.nature.com^",
"||s7.addthis.com^",