From 7f7b1bd0ff45bcb7968dc3da912c2ff3dd74d07c Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Mon, 11 Oct 2021 16:34:23 -0700
Subject: pdf ingest: journals.uchicago.edu pattern

---
 python/sandcrawler/html_metadata.py | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'python/sandcrawler')

diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 1a328ef..23bf136 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -583,6 +583,14 @@ PDF_FULLTEXT_PATTERNS: List[dict] = [
         "technique": "PDF Download link (e-manuscripta.ch)",
         "example_page": "https://www.e-manuscripta.ch/zut/doi/10.7891/e-manuscripta-112176",
     },
+    {
+        "in_doc_url": "journals.uchicago.edu",
+        "in_fulltext_url": "pdf",
+        "selector": "nav.article__navbar a.ctrl--pdf",
+        "attr": "href",
+        "technique": "PDF Download link (journals.uchicago.edu)",
+        "example_page": "https://www.journals.uchicago.edu/doi/10.14318/hau1.1.008",
+    },
 ]
 
 FULLTEXT_URL_PATTERNS_SKIP = [
-- 
cgit v1.2.3