From d8f82f5836004d394a419574c50f0636369c94d7 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Mon, 24 Oct 2022 14:22:39 -0700
Subject: ingest: more generic PDF fulltext URL patterns

---
 python/sandcrawler/html_metadata.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'python')

diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 04a2f4a..9773615 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -725,6 +725,20 @@ PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
         "technique": "worldscientific reader",
         "example_page": "https://www.worldscientific.com/doi/epdf/10.1142/S0116110521500098",
     },
+    {
+        "in_fulltext_url": "pdf",
+        "selector": ".container .view-content .download-article a",
+        "attr": "href",
+        "technique": "generic download article button",
+        "example_page": "https://science.lpnu.ua/mmc/all-volumes-and-issues/volume-9-number-1-2022/pursuit-differential-game-many-pursuers-and-one",
+    },
+    {
+        "in_fulltext_url": "pdf",
+        "selector": "body a.download-pdf",
+        "attr": "href",
+        "technique": "generic download article button",
+        "example_page": "https://plit-periodical.com.ua/arhiv/struktura-ta-vlastyvosti-materialu-zrazkiv-vyroshchenyh-metodom-selektyvnogo-lazernogo",
+    },
 ]
 
 FULLTEXT_URL_PATTERNS_SKIP: List[str] = [
-- 
cgit v1.2.3