aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-10-11 16:34:23 -0700
committer Bryan Newbold <bnewbold@archive.org> 2021-10-11 16:34:23 -0700
commit 7f7b1bd0ff45bcb7968dc3da912c2ff3dd74d07c (patch)
tree 417722c13d244bccee97d4d8e7fc30c8facf11d1
parent 850f97eefa11e52a4a802eaada720f692cda6264 (diff)
download sandcrawler-7f7b1bd0ff45bcb7968dc3da912c2ff3dd74d07c.tar.gz
sandcrawler-7f7b1bd0ff45bcb7968dc3da912c2ff3dd74d07c.zip
pdf ingest: journals.uchicago.edu pattern
-rw-r--r-- python/sandcrawler/html_metadata.py 8
1 files changed, 8 insertions, 0 deletions
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 1a328ef..23bf136 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -583,6 +583,14 @@ PDF_FULLTEXT_PATTERNS: List[dict] = [
"technique": "PDF Download link (e-manuscripta.ch)",
"example_page": "https://www.e-manuscripta.ch/zut/doi/10.7891/e-manuscripta-112176",
},
+ {
+ "in_doc_url": "journals.uchicago.edu",
+ "in_fulltext_url": "pdf",
+ "selector": "nav.article__navbar a.ctrl--pdf",
+ "attr": "href",
+ "technique": "PDF Download link (journals.uchicago.edu)",
+ "example_page": "https://www.journals.uchicago.edu/doi/10.14318/hau1.1.008",
+ },
]
FULLTEXT_URL_PATTERNS_SKIP = [