diff options
author | Bryan Newbold <bnewbold@archive.org> | 2022-07-12 21:01:20 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2022-07-12 21:01:20 -0700 |
commit | 9516fd423d0a0545f8206e0ef42439d0578fa20c (patch) | |
tree | 02696debe498acf0aa89d26ebdada79f7c3dcdea /python | |
parent | 0d5f6468960a7bbf7a499654feaaa5ecab7d9130 (diff) | |
download | sandcrawler-9516fd423d0a0545f8206e0ef42439d0578fa20c.tar.gz sandcrawler-9516fd423d0a0545f8206e0ef42439d0578fa20c.zip |
ingest: random site PDF link pattern
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/html_metadata.py | 7 |
1 file changed, 7 insertions, 0 deletions
```diff
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index edaf89f..7b44bfe 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -639,6 +639,13 @@ PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
         "attr": "href",
         "technique": "doaj.org access link",
     },
+    {
+        "in_doc_url": "/jvi.aspx",
+        "in_fulltext_url": "download_fulltext",
+        "selector": "div.siteMainWrapper div.siteArticleShare a[target='_blank'].list-group-item",
+        "attr": "href",
+        "technique": "erciyesmedj.com publication system PDF download link",
+    },
 ]
 
 FULLTEXT_URL_PATTERNS_SKIP: List[str] = [
```