diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-11 16:34:23 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-11 16:34:23 -0700 |
commit | 7f7b1bd0ff45bcb7968dc3da912c2ff3dd74d07c (patch) | |
tree | 417722c13d244bccee97d4d8e7fc30c8facf11d1 | |
parent | 850f97eefa11e52a4a802eaada720f692cda6264 (diff) | |
download | sandcrawler-7f7b1bd0ff45bcb7968dc3da912c2ff3dd74d07c.tar.gz sandcrawler-7f7b1bd0ff45bcb7968dc3da912c2ff3dd74d07c.zip |
pdf ingest: journals.uchicago.edu pattern
-rw-r--r-- | python/sandcrawler/html_metadata.py | 8 |
1 file changed, 8 insertions, 0 deletions
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 1a328ef..23bf136 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -583,6 +583,14 @@ PDF_FULLTEXT_PATTERNS: List[dict] = [
         "technique": "PDF Download link (e-manuscripta.ch)",
         "example_page": "https://www.e-manuscripta.ch/zut/doi/10.7891/e-manuscripta-112176",
     },
+    {
+        "in_doc_url": "journals.uchicago.edu",
+        "in_fulltext_url": "pdf",
+        "selector": "nav.article__navbar a.ctrl--pdf",
+        "attr": "href",
+        "technique": "PDF Download link (journals.uchicago.edu)",
+        "example_page": "https://www.journals.uchicago.edu/doi/10.14318/hau1.1.008",
+    },
 ]
 
 FULLTEXT_URL_PATTERNS_SKIP = [