about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2022-07-12 21:01:20 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2022-07-12 21:01:20 -0700
commit: 9516fd423d0a0545f8206e0ef42439d0578fa20c (patch)
tree: 02696debe498acf0aa89d26ebdada79f7c3dcdea
parent: 0d5f6468960a7bbf7a499654feaaa5ecab7d9130 (diff)
download: sandcrawler-9516fd423d0a0545f8206e0ef42439d0578fa20c.tar.gz
sandcrawler-9516fd423d0a0545f8206e0ef42439d0578fa20c.zip
ingest: random site PDF link pattern
-rw-r--r-- python/sandcrawler/html_metadata.py 7
1 files changed, 7 insertions, 0 deletions
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index edaf89f..7b44bfe 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -639,6 +639,13 @@ PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
"attr": "href",
"technique": "doaj.org access link",
},
+ {
+ "in_doc_url": "/jvi.aspx",
+ "in_fulltext_url": "download_fulltext",
+ "selector": "div.siteMainWrapper div.siteArticleShare a[target='_blank'].list-group-item",
+ "attr": "href",
+ "technique": "erciyesmedj.com publication system PDF download link",
+ },
]
FULLTEXT_URL_PATTERNS_SKIP: List[str] = [