diff options
author | Bryan Newbold <bnewbold@archive.org> | 2022-10-24 10:19:46 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2022-10-24 10:19:46 -0700 |
commit | 855153ae4fe03656adde16c56a4347f4b3d26487 (patch) | |
tree | 9cac24dbc38552c3ac14d88265f4bbc36e3475c1 /python | |
parent | 97aa00038fbce39097d5f78b8891bbb88b71af75 (diff) | |
download | sandcrawler-855153ae4fe03656adde16c56a4347f4b3d26487.tar.gz sandcrawler-855153ae4fe03656adde16c56a4347f4b3d26487.zip |
html: worldscientific PDF URL extraction
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/html_metadata.py | 16 |
1 files changed, 16 insertions, 0 deletions
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 3d9e8ca..04a2f4a 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -709,6 +709,22 @@ PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
         "technique": "OJS remote_pdf link",
         "example_page": "https://www.mediterranea-comunicacion.org/article/view/22240",
     },
+    {
+        "in_doc_url": "worldscientific.com/doi/abs/",
+        "in_fulltext_url": "/reader/",
+        "selector": "article.container .single__download a",
+        "attr": "href",
+        "technique": "worldscientific landing pages",
+        "example_page": "https://www.worldscientific.com/doi/abs/10.1142/S0116110521500098",
+    },
+    {
+        "in_doc_url": "worldscientific.com/doi/",
+        "in_fulltext_url": "/pdf/",
+        "selector": "noscript a[target='_blank']",
+        "attr": "href",
+        "technique": "worldscientific reader",
+        "example_page": "https://www.worldscientific.com/doi/epdf/10.1142/S0116110521500098",
+    },
 ]
 
 FULLTEXT_URL_PATTERNS_SKIP: List[str] = [