From 2a96e2baeb7d318a4aa2abbda7052757a02f5167 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Thu, 13 Jan 2022 18:26:21 -0800
Subject: sandcrawler: additional extracts, mostly OJS

---
 python/sandcrawler/html_metadata.py | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 37b8e89..2fb500c 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -328,7 +328,7 @@ PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
         "in_doc_url": "/article/view/",
         "selector": "a#pdfDownloadLink",
         "attr": "href",
-        "technique": "pdfDownloadLink link",
+        "technique": "OJS pdfDownloadLink link",
         "example_page": "http://www.revistas.unam.mx/index.php/rep/article/view/35503/32336",
     },
     {
@@ -605,6 +605,28 @@ PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
         "technique": "PDF Download link (integrityresjournals.org)",
         "example_page": "https://integrityresjournals.org/journal/JBBD/article-abstract/750B649A1",
     },
+    {
+        "in_doc_url": "/article/view/",
+        "in_fulltext_url": "/article/download/",
+        "selector": "body.pkp_page_article a.download",
+        "attr": "href",
+        "technique": "OJS PDF Embed",
+        "example_page": "https://periodicals.karazin.ua/language_teaching/article/view/12543/11957",
+    },
+    {
+        "in_doc_url": "/article/view/",
+        "in_fulltext_url": "/article/",
+        "selector": "a.pdf",
+        "attr": "href",
+        "technique": "OJS PDF link",
+    },
+    {
+        "in_doc_url": "scitemed.com/article/",
+        "in_fulltext_url": ".pdf",
+        "selector": "li.tab_pdf_btn a",
+        "attr": "href",
+        "technique": "PDF link (scitemed.com)",
+    },
 ]
 
 FULLTEXT_URL_PATTERNS_SKIP: List[str] = [
-- 
cgit v1.2.3