From 073e9ebb48ef296d0e73162938f07e7f7f44e0d2 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Thu, 13 Jan 2022 15:38:44 -0800
Subject: ingest: PDF pattern for integrityresjournals.org

---
 python/sandcrawler/html_metadata.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index c46788e..37b8e89 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -597,6 +597,14 @@ PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
         "technique": "PDF Download link (journals.uchicago.edu)",
         "example_page": "https://www.journals.uchicago.edu/doi/10.14318/hau1.1.008",
     },
+    {
+        "in_doc_url": "integrityresjournals.org",
+        "in_fulltext_url": "/article-full-text-pdf/",
+        "selector": "a[target='_blank'].btn-danger",
+        "attr": "href",
+        "technique": "PDF Download link (integrityresjournals.org)",
+        "example_page": "https://integrityresjournals.org/journal/JBBD/article-abstract/750B649A1",
+    },
 ]
 
 FULLTEXT_URL_PATTERNS_SKIP: List[str] = [
-- 
cgit v1.2.3