about | summary | refs | log | tree | commit | diff | stats
path: root/python/sandcrawler/html_metadata.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2021-03-26 12:10:13 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-03-26 12:10:13 -0700
commit: 8f8ce8cc7585ededd2fb6d66abc13134d3182f0d (patch)
tree: 09594036de3dfc95e210e8e3a68982b8593074a8 /python/sandcrawler/html_metadata.py
parent: 265e34328d2cfa506eeaf0d8c43acefca9757c93 (diff)
download: sandcrawler-8f8ce8cc7585ededd2fb6d66abc13134d3182f0d.tar.gz
sandcrawler-8f8ce8cc7585ededd2fb6d66abc13134d3182f0d.zip
ingest: handle current degruyter PDF link pattern
Diffstat (limited to 'python/sandcrawler/html_metadata.py')
-rw-r--r-- python/sandcrawler/html_metadata.py | 8
1 files changed, 8 insertions, 0 deletions
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index f9f48a6..1c054fa 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -375,6 +375,14 @@ PDF_FULLTEXT_PATTERNS: List[dict] = [
"technique": "PDF URL link",
"example_page": "http://www.bezmialemscience.org/archives/archive-detail/article-preview/editorial/20439",
},
+ {
+ "in_doc_url": "degruyter.com/document/",
+ "in_fulltext_url": "/pdf",
+ "selector": "a.downloadPdf",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "https://www.degruyter.com/document/doi/10.1515/zaw-2021-0001/html",
+ },
]
FULLTEXT_URL_PATTERNS_SKIP = [