diff options
author | Bryan Newbold <bnewbold@archive.org> | 2022-10-24 14:22:39 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2022-10-24 14:22:39 -0700 |
commit | d8f82f5836004d394a419574c50f0636369c94d7 (patch) | |
tree | 5c18f0644380b032cb8ddeafc24b6d0ef13aa4d5 | |
parent | 5563cb5121c94efcf1819b915e7e7c602215a6e5 (diff) | |
download | sandcrawler-d8f82f5836004d394a419574c50f0636369c94d7.tar.gz sandcrawler-d8f82f5836004d394a419574c50f0636369c94d7.zip |
ingest: more generic PDF fulltext URL patterns
-rw-r--r-- | python/sandcrawler/html_metadata.py | 14 |
1 files changed, 14 insertions, 0 deletions
```diff
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 04a2f4a..9773615 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -725,6 +725,20 @@ PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
         "technique": "worldscientific reader",
         "example_page": "https://www.worldscientific.com/doi/epdf/10.1142/S0116110521500098",
     },
+    {
+        "in_fulltext_url": "pdf",
+        "selector": ".container .view-content .download-article a",
+        "attr": "href",
+        "technique": "generic download article button",
+        "example_page": "https://science.lpnu.ua/mmc/all-volumes-and-issues/volume-9-number-1-2022/pursuit-differential-game-many-pursuers-and-one",
+    },
+    {
+        "in_fulltext_url": "pdf",
+        "selector": "body a.download-pdf",
+        "attr": "href",
+        "technique": "generic download article button",
+        "example_page": "https://plit-periodical.com.ua/arhiv/struktura-ta-vlastyvosti-materialu-zrazkiv-vyroshchenyh-metodom-selektyvnogo-lazernogo",
+    },
 ]
 
 FULLTEXT_URL_PATTERNS_SKIP: List[str] = [
```