aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-09-03 18:47:12 -0700
committer Bryan Newbold <bnewbold@archive.org> 2021-09-03 18:47:12 -0700
commit 379276a55b14474a0babe829a41f10bf3a89bbe7 (patch)
tree dc78c441afc0c449fe30c5bbb9474bf16953eded /python
parent 5afd7e43c93617569df103709795f5f7ec95380d (diff)
download sandcrawler-379276a55b14474a0babe829a41f10bf3a89bbe7.tar.gz
sandcrawler-379276a55b14474a0babe829a41f10bf3a89bbe7.zip
yet more PDF URL patterns
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/html_metadata.py 48
1 files changed, 48 insertions, 0 deletions
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 871be32..44576e6 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -222,6 +222,14 @@ XML_FULLTEXT_PATTERNS: List[dict] = [
"technique": "ARPHA XML link",
"example_page": "https://zookeys.pensoft.net/article/26391",
},
+ {
+ "in_doc_url": "frontiersin.org/",
+ "in_fulltext_url": "xml",
+ "selector": "a.download-files-nlm",
+ "attr": "href",
+ "technique": "XML (NLM) download link (frontiersin.org)",
+ "example_page": "https://www.frontiersin.org/articles/10.3389/fnins.2021.722592/full",
+ },
]
HTML_FULLTEXT_PATTERNS: List[dict] = [
@@ -524,6 +532,46 @@ PDF_FULLTEXT_PATTERNS: List[dict] = [
"technique": "Media link (Plone)",
"example_page": "http://xjornadaslc.fahce.unlp.edu.ar/actas/Ramon_Esteban_Chaparro.pdf/view",
},
+ {
+ "in_doc_url": "isca-speech.org/",
+ "in_fulltext_url": "pdf",
+ "selector": ".w3-container a",
+ "attr": "href",
+ "technique": "PDF URL link (isca-speech.org)",
+ "example_page": "https://www.isca-speech.org/archive/interspeech_2006/chitturi06b_interspeech.html",
+ },
+ {
+ "in_doc_url": "://repository.dri.ie/",
+ "in_fulltext_url": "/download",
+ "selector": "#dri_download_assets > div > a",
+ "attr": "href",
+ "technique": "Download link (repository.dri.ie)",
+ "example_page": "https://repository.dri.ie/catalog/qf8621102",
+ },
+ {
+ "in_doc_url": "frontiersin.org/",
+ "in_fulltext_url": "pdf",
+ "selector": "a.download-files-pdf",
+ "attr": "href",
+ "technique": "PDF Download link (frontiersin.org)",
+ "example_page": "https://www.frontiersin.org/articles/10.3389/fnins.2021.722592/full",
+ },
+ {
+ "in_doc_url": "cureus.com/",
+ "in_fulltext_url": "pdf",
+ "selector": ".small-medium-pdf a.pdf-download-button",
+ "attr": "href",
+ "technique": "PDF Download link (cureus.com)",
+ "example_page": "https://www.cureus.com/articles/69542-tramadol-induced-jerks",
+ },
+ {
+ "in_doc_url": "e-manuscripta.ch/",
+ "in_fulltext_url": "pdf",
+ "selector": "#titleinfoPdfDownload a.resourceLink",
+ "attr": "href",
+ "technique": "PDF Download link (e-manuscripta.ch)",
+ "example_page": "https://www.e-manuscripta.ch/zut/doi/10.7891/e-manuscripta-112176",
+ },
]
FULLTEXT_URL_PATTERNS_SKIP = [