about | summary | refs | log | tree | commit | diff | stats
path: root/python/sandcrawler
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2022-01-13 18:26:21 -0800
committer Bryan Newbold <bnewbold@archive.org> 2022-01-13 18:26:21 -0800
commit 2a96e2baeb7d318a4aa2abbda7052757a02f5167 (patch)
tree 116a819ce8060f1dbfe617f5b2ae780f1be42948 /python/sandcrawler
parent 23c560af176a8c2e15c20ddcac78fb3eb736d19d (diff)
download sandcrawler-2a96e2baeb7d318a4aa2abbda7052757a02f5167.tar.gz
sandcrawler-2a96e2baeb7d318a4aa2abbda7052757a02f5167.zip
sandcrawler: additional extracts, mostly OJS
Diffstat (limited to 'python/sandcrawler')
-rw-r--r-- python/sandcrawler/html_metadata.py 24
1 files changed, 23 insertions, 1 deletions
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 37b8e89..2fb500c 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -328,7 +328,7 @@ PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
"in_doc_url": "/article/view/",
"selector": "a#pdfDownloadLink",
"attr": "href",
- "technique": "pdfDownloadLink link",
+ "technique": "OJS pdfDownloadLink link",
"example_page": "http://www.revistas.unam.mx/index.php/rep/article/view/35503/32336",
},
{
@@ -605,6 +605,28 @@ PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
"technique": "PDF Download link (integrityresjournals.org)",
"example_page": "https://integrityresjournals.org/journal/JBBD/article-abstract/750B649A1",
},
+ {
+ "in_doc_url": "/article/view/",
+ "in_fulltext_url": "/article/download/",
+ "selector": "body.pkp_page_article a.download",
+ "attr": "href",
+ "technique": "OJS PDF Embed",
+ "example_page": "https://periodicals.karazin.ua/language_teaching/article/view/12543/11957",
+ },
+ {
+ "in_doc_url": "/article/view/",
+ "in_fulltext_url": "/article/",
+ "selector": "a.pdf",
+ "attr": "href",
+ "technique": "OJS PDF link",
+ },
+ {
+ "in_doc_url": "scitemed.com/article/",
+ "in_fulltext_url": ".pdf",
+ "selector": "li.tab_pdf_btn a",
+ "attr": "href",
+ "technique": "PDF link (scitemed.com)",
+ },
]
FULLTEXT_URL_PATTERNS_SKIP: List[str] = [