From 8f8ce8cc7585ededd2fb6d66abc13134d3182f0d Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Fri, 26 Mar 2021 12:10:13 -0700
Subject: ingest: handle current degruyter PDF link pattern

---
 python/sandcrawler/html_metadata.py | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'python')

diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index f9f48a6..1c054fa 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -375,6 +375,14 @@ PDF_FULLTEXT_PATTERNS: List[dict] = [
         "technique": "PDF URL link",
         "example_page": "http://www.bezmialemscience.org/archives/archive-detail/article-preview/editorial/20439",
     },
+    {
+        "in_doc_url": "degruyter.com/document/",
+        "in_fulltext_url": "/pdf",
+        "selector": "a.downloadPdf",
+        "attr": "href",
+        "technique": "PDF URL link",
+        "example_page": "https://www.degruyter.com/document/doi/10.1515/zaw-2021-0001/html",
+    },
 ]
 
 FULLTEXT_URL_PATTERNS_SKIP = [
-- 
cgit v1.2.3