From 9516fd423d0a0545f8206e0ef42439d0578fa20c Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Tue, 12 Jul 2022 21:01:20 -0700
Subject: ingest: random site PDF link pattern

---
 python/sandcrawler/html_metadata.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index edaf89f..7b44bfe 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -639,6 +639,13 @@ PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
         "attr": "href",
         "technique": "doaj.org access link",
     },
+    {
+        "in_doc_url": "/jvi.aspx",
+        "in_fulltext_url": "download_fulltext",
+        "selector": "div.siteMainWrapper div.siteArticleShare a[target='_blank'].list-group-item",
+        "attr": "href",
+        "technique": "erciyesmedj.com publication system PDF download link",
+    },
 ]
 
 FULLTEXT_URL_PATTERNS_SKIP: List[str] = [
-- 
cgit v1.2.3