about | summary | refs | log | tree | commit | diff | stats
path: root/python/sandcrawler
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-05-21 17:41:41 -0700
committer Bryan Newbold <bnewbold@archive.org> 2021-05-21 17:41:41 -0700
commit 1263ee33535d232d702324980e7ff69305ed8795 (patch)
tree f4ec34e52aec28c42ba432fab2945419a3658d3f /python/sandcrawler
parent 071af9a4832dcb24be417de9b658d678056b5bf2 (diff)
download sandcrawler-1263ee33535d232d702324980e7ff69305ed8795.tar.gz
sandcrawler-1263ee33535d232d702324980e7ff69305ed8795.zip
ingest PDF extraction updates
Diffstat (limited to 'python/sandcrawler')
-rw-r--r-- python/sandcrawler/html.py 17
-rw-r--r-- python/sandcrawler/html_metadata.py 54
-rw-r--r-- python/sandcrawler/ingest.py 5
3 files changed, 74 insertions, 2 deletions
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index d3f5cfe..ca600e4 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -94,6 +94,23 @@ def extract_fulltext_url(html_url, html_body):
url = url.split('?via')[0]
return dict(next_url=url, technique="elsevier-linkinghub")
+ # sciencedirect PDF URL extract
+ # https://www.sciencedirect.com/science/article/pii/S0169204621000670
+ if 'sciencedirect.com/science/article/pii/' in html_url and not html_url.endswith(".pdf"):
+ json_tag = soup.find("script", attrs={"type": "application/json", "data-iso-key": "_0"})
+ url = None
+ try:
+ json_text = json_tag.string
+ json_meta = json.loads(json_text)
+ pdf_meta = json_meta['article']['pdfDownload']['urlMetadata']
+ print(pdf_meta, file=sys.stderr)
+ # https://www.sciencedirect.com/science/article/pii/S0169204621000670/pdfft?md5=c4a83d06b334b627ded74cf9423bfa56&pid=1-s2.0-S0169204621000670-main.pdf
+ url = html_url + pdf_meta['pdfExtension'] + "?md5=" + pdf_meta['queryParams']['md5'] + "&pid=" + pdf_meta['queryParams']['pid']
+ except Exception as e:
+ raise e
+ if url:
+ return dict(pdf_url=url, technique="sciencedirect-munge-json")
+
# sciencedirect PDF bounce page
# https://www.sciencedirect.com/science/article/pii/S2590109519300424/pdfft?md5=854f43a44de186eb58674b8e20631691&pid=1-s2.0-S2590109519300424-main.pdf
if '://www.sciencedirect.com/' in html_url and html_url.endswith(".pdf"):
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 1e58778..c805f0a 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -383,6 +383,60 @@ PDF_FULLTEXT_PATTERNS: List[dict] = [
"technique": "PDF URL link",
"example_page": "https://www.degruyter.com/document/doi/10.1515/zaw-2021-0001/html",
},
+ {
+ "in_doc_url": "repositorio.unicamp.br/handle/",
+ "in_fulltext_url": "/bitstream/",
+ "selector": "table.panel-body a[target='_blank']",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "http://www.repositorio.unicamp.br/handle/REPOSIP/287750",
+ },
+ {
+ "in_doc_url": "dlc.library.columbia.edu/durst/",
+ "selector": "dd.blacklight-lib_non_item_in_context_url_ssm a[href]",
+ "attr": "href",
+ "technique": "Access URL link",
+ "example_page": "https://dlc.library.columbia.edu/durst/cul:18931zcrk9",
+ },
+ {
+ "in_doc_url": "fldeploc.dep.state.fl.us/geodb_query/fgs_doi",
+ "in_fulltext_url": "pdf",
+ "selector": "p a[href]",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "http://fldeploc.dep.state.fl.us/geodb_query/fgs_doi.asp?searchCode=IC29",
+ },
+ {
+ "in_doc_url": "preprints.jmir.org/preprint/",
+ "selector": "a.pdf-download-button",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "https://preprints.jmir.org/preprint/22556",
+ },
+ {
+ "in_doc_url": "bloomsburycollections.com/",
+ "in_fulltext_url": "pdf",
+ "selector": "li.download-item a[href]",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "https://www.bloomsburycollections.com/book/the-political-economies-of-media-the-transformation-of-the-global-media-industries/the-political-economies-of-media-and-the-transformation-of-the-global-media-industries",
+ },
+ {
+ "in_doc_url": "emerald.com/insight/content/",
+ "in_fulltext_url": "pdf",
+ "selector": "a.intent_pdf_link",
+ "attr": "href",
+ "technique": "PDF URL link",
+ "example_page": "https://www.emerald.com/insight/content/doi/10.1108/RAMJ-11-2020-0065/full/html",
+ },
+ {
+ "in_doc_url": "ingentaconnect.com/content/",
+ "in_fulltext_url": "pdf",
+ "selector": "a.pdf[data-popup]",
+ "attr": "data-popup",
+ "technique": "PDF URL link",
+ "example_page": "https://www.ingentaconnect.com/content/ista/sst/2021/00000049/00000001/art00007",
+ },
]
FULLTEXT_URL_PATTERNS_SKIP = [
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index eb8e256..b610ab4 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -128,8 +128,9 @@ class IngestFileWorker(SandcrawlerWorker):
self.wall_blocklist = [
# loginwall
"://profile.thieme.de/HTML/sso/ejournals/login.htm",
- "://login.bepress.com/"
- "?SAMLRequest="
+ "://login.bepress.com/",
+ "?SAMLRequest=",
+ "://osapublishing.org/captcha/",
]
# these are special-case web domains for which we want SPN2 to not run