about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2022-01-13 15:38:44 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2022-01-13 15:38:44 -0800
commit: 073e9ebb48ef296d0e73162938f07e7f7f44e0d2 (patch)
tree: c24040e84d2d29ea46dcf6963f38e8517710d22a
parent: 97339426c4d0022c3fdf5948ef94b99bb1e120ee (diff)
download: sandcrawler-073e9ebb48ef296d0e73162938f07e7f7f44e0d2.tar.gz
sandcrawler-073e9ebb48ef296d0e73162938f07e7f7f44e0d2.zip
ingest: PDF pattern for integrityresjournals.org
-rw-r--r-- python/sandcrawler/html_metadata.py 8
1 files changed, 8 insertions, 0 deletions
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index c46788e..37b8e89 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -597,6 +597,14 @@ PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
"technique": "PDF Download link (journals.uchicago.edu)",
"example_page": "https://www.journals.uchicago.edu/doi/10.14318/hau1.1.008",
},
+ {
+ "in_doc_url": "integrityresjournals.org",
+ "in_fulltext_url": "/article-full-text-pdf/",
+ "selector": "a[target='_blank'].btn-danger",
+ "attr": "href",
+ "technique": "PDF Download link (integrityresjournals.org)",
+ "example_page": "https://integrityresjournals.org/journal/JBBD/article-abstract/750B649A1",
+ },
]
FULLTEXT_URL_PATTERNS_SKIP: List[str] = [