aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2022-10-24 10:19:46 -0700
committer Bryan Newbold <bnewbold@archive.org> 2022-10-24 10:19:46 -0700
commit 855153ae4fe03656adde16c56a4347f4b3d26487 (patch)
tree 9cac24dbc38552c3ac14d88265f4bbc36e3475c1 /python/sandcrawler
parent 97aa00038fbce39097d5f78b8891bbb88b71af75 (diff)
download sandcrawler-855153ae4fe03656adde16c56a4347f4b3d26487.tar.gz
sandcrawler-855153ae4fe03656adde16c56a4347f4b3d26487.zip
html: worldscientific PDF URL extraction
Diffstat (limited to 'python/sandcrawler')
-rw-r--r-- python/sandcrawler/html_metadata.py 16
1 files changed, 16 insertions, 0 deletions
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 3d9e8ca..04a2f4a 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -709,6 +709,22 @@ PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
"technique": "OJS remote_pdf link",
"example_page": "https://www.mediterranea-comunicacion.org/article/view/22240",
},
+ {
+ "in_doc_url": "worldscientific.com/doi/abs/",
+ "in_fulltext_url": "/reader/",
+ "selector": "article.container .single__download a",
+ "attr": "href",
+ "technique": "worldscientific landing pages",
+ "example_page": "https://www.worldscientific.com/doi/abs/10.1142/S0116110521500098",
+ },
+ {
+ "in_doc_url": "worldscientific.com/doi/",
+ "in_fulltext_url": "/pdf/",
+ "selector": "noscript a[target='_blank']",
+ "attr": "href",
+ "technique": "worldscientific reader",
+ "example_page": "https://www.worldscientific.com/doi/epdf/10.1142/S0116110521500098",
+ },
]
FULLTEXT_URL_PATTERNS_SKIP: List[str] = [