diff options
author | Bryan Newbold <bnewbold@archive.org> | 2022-01-13 18:26:21 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2022-01-13 18:26:21 -0800 |
commit | 2a96e2baeb7d318a4aa2abbda7052757a02f5167 (patch) | |
tree | 116a819ce8060f1dbfe617f5b2ae780f1be42948 /python | |
parent | 23c560af176a8c2e15c20ddcac78fb3eb736d19d (diff) | |
download | sandcrawler-2a96e2baeb7d318a4aa2abbda7052757a02f5167.tar.gz sandcrawler-2a96e2baeb7d318a4aa2abbda7052757a02f5167.zip |
sandcrawler: additional extracts, mostly OJS
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/html_metadata.py | 24 |
1 file changed, 23 insertions, 1 deletion
```diff
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 37b8e89..2fb500c 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -328,7 +328,7 @@ PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
         "in_doc_url": "/article/view/",
         "selector": "a#pdfDownloadLink",
         "attr": "href",
-        "technique": "pdfDownloadLink link",
+        "technique": "OJS pdfDownloadLink link",
         "example_page": "http://www.revistas.unam.mx/index.php/rep/article/view/35503/32336",
     },
     {
@@ -605,6 +605,28 @@ PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
         "technique": "PDF Download link (integrityresjournals.org)",
         "example_page": "https://integrityresjournals.org/journal/JBBD/article-abstract/750B649A1",
     },
+    {
+        "in_doc_url": "/article/view/",
+        "in_fulltext_url": "/article/download/",
+        "selector": "body.pkp_page_article a.download",
+        "attr": "href",
+        "technique": "OJS PDF Embed",
+        "example_page": "https://periodicals.karazin.ua/language_teaching/article/view/12543/11957",
+    },
+    {
+        "in_doc_url": "/article/view/",
+        "in_fulltext_url": "/article/",
+        "selector": "a.pdf",
+        "attr": "href",
+        "technique": "OJS PDF link",
+    },
+    {
+        "in_doc_url": "scitemed.com/article/",
+        "in_fulltext_url": ".pdf",
+        "selector": "li.tab_pdf_btn a",
+        "attr": "href",
+        "technique": "PDF link (scitemed.com)",
+    },
 ]
 
 FULLTEXT_URL_PATTERNS_SKIP: List[str] = [
```