aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/html_metadata.py
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2021-05-21 17:41:41 -0700
committerBryan Newbold <bnewbold@archive.org>2021-05-21 17:41:41 -0700
commit1263ee33535d232d702324980e7ff69305ed8795 (patch)
treef4ec34e52aec28c42ba432fab2945419a3658d3f /python/sandcrawler/html_metadata.py
parent071af9a4832dcb24be417de9b658d678056b5bf2 (diff)
downloadsandcrawler-1263ee33535d232d702324980e7ff69305ed8795.tar.gz
sandcrawler-1263ee33535d232d702324980e7ff69305ed8795.zip
ingest PDF extraction updates
Diffstat (limited to 'python/sandcrawler/html_metadata.py')
-rw-r--r--python/sandcrawler/html_metadata.py54
1 files changed, 54 insertions, 0 deletions
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 1e58778..c805f0a 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -383,6 +383,60 @@ PDF_FULLTEXT_PATTERNS: List[dict] = [
"technique": "PDF URL link",
"example_page": "https://www.degruyter.com/document/doi/10.1515/zaw-2021-0001/html",
},
+ {
+ "in_doc_url": "repositorio.unicamp.br/handle/",
+ "in_fulltext_url": "/bitstream/",
+ "selector": "table.panel-body a[target='_blank']",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "http://www.repositorio.unicamp.br/handle/REPOSIP/287750",
+ },
+ {
+ "in_doc_url": "dlc.library.columbia.edu/durst/",
+ "selector": "dd.blacklight-lib_non_item_in_context_url_ssm a[href]",
+ "attr": "href",
+ "technique": "Access URL link",
+ "example_page": "https://dlc.library.columbia.edu/durst/cul:18931zcrk9",
+ },
+ {
+ "in_doc_url": "fldeploc.dep.state.fl.us/geodb_query/fgs_doi",
+ "in_fulltext_url": "pdf",
+ "selector": "p a[href]",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "http://fldeploc.dep.state.fl.us/geodb_query/fgs_doi.asp?searchCode=IC29",
+ },
+ {
+ "in_doc_url": "preprints.jmir.org/preprint/",
+ "selector": "a.pdf-download-button",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "https://preprints.jmir.org/preprint/22556",
+ },
+ {
+ "in_doc_url": "bloomsburycollections.com/",
+ "in_fulltext_url": "pdf",
+ "selector": "li.download-item a[href]",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "https://www.bloomsburycollections.com/book/the-political-economies-of-media-the-transformation-of-the-global-media-industries/the-political-economies-of-media-and-the-transformation-of-the-global-media-industries",
+ },
+ {
+ "in_doc_url": "emerald.com/insight/content/",
+ "in_fulltext_url": "pdf",
+ "selector": "a.intent_pdf_link",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "https://www.emerald.com/insight/content/doi/10.1108/RAMJ-11-2020-0065/full/html",
+ },
+ {
+ "in_doc_url": "ingentaconnect.com/content/",
+ "in_fulltext_url": "pdf",
+ "selector": "a.pdf[data-popup]",
+ "attr": "data-popup",
+ "technique": "PDF URL link",
+ "example_page": "https://www.ingentaconnect.com/content/ista/sst/2021/00000049/00000001/art00007",
+ },
]
FULLTEXT_URL_PATTERNS_SKIP = [