about summary refs log tree commit diff stats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2022-10-24 14:22:39 -0700
committer Bryan Newbold <bnewbold@archive.org> 2022-10-24 14:22:39 -0700
commit d8f82f5836004d394a419574c50f0636369c94d7 (patch)
tree 5c18f0644380b032cb8ddeafc24b6d0ef13aa4d5 /python
parent 5563cb5121c94efcf1819b915e7e7c602215a6e5 (diff)
download sandcrawler-d8f82f5836004d394a419574c50f0636369c94d7.tar.gz
sandcrawler-d8f82f5836004d394a419574c50f0636369c94d7.zip
ingest: more generic PDF fulltext URL patterns
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/html_metadata.py 14
1 files changed, 14 insertions, 0 deletions
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 04a2f4a..9773615 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -725,6 +725,20 @@ PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
"technique": "worldscientific reader",
"example_page": "https://www.worldscientific.com/doi/epdf/10.1142/S0116110521500098",
},
+ {
+ "in_fulltext_url": "pdf",
+ "selector": ".container .view-content .download-article a",
+ "attr": "href",
+ "technique": "generic download article button",
+ "example_page": "https://science.lpnu.ua/mmc/all-volumes-and-issues/volume-9-number-1-2022/pursuit-differential-game-many-pursuers-and-one",
+ },
+ {
+ "in_fulltext_url": "pdf",
+ "selector": "body a.download-pdf",
+ "attr": "href",
+ "technique": "generic download article button",
+ "example_page": "https://plit-periodical.com.ua/arhiv/struktura-ta-vlastyvosti-materialu-zrazkiv-vyroshchenyh-metodom-selektyvnogo-lazernogo",
+ },
]
FULLTEXT_URL_PATTERNS_SKIP: List[str] = [