From 1263ee33535d232d702324980e7ff69305ed8795 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 21 May 2021 17:41:41 -0700 Subject: ingest PDF extraction updates --- python/sandcrawler/html.py | 17 ++++++++++++ python/sandcrawler/html_metadata.py | 54 +++++++++++++++++++++++++++++++++++++ python/sandcrawler/ingest.py | 5 ++-- 3 files changed, 74 insertions(+), 2 deletions(-) (limited to 'python/sandcrawler') diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py index d3f5cfe..ca600e4 100644 --- a/python/sandcrawler/html.py +++ b/python/sandcrawler/html.py @@ -94,6 +94,23 @@ def extract_fulltext_url(html_url, html_body): url = url.split('?via')[0] return dict(next_url=url, technique="elsevier-linkinghub") + # sciencedirect PDF URL extract + # https://www.sciencedirect.com/science/article/pii/S0169204621000670 + if 'sciencedirect.com/science/article/pii/' in html_url and not html_url.endswith(".pdf"): + json_tag = soup.find("script", attrs={"type": "application/json", "data-iso-key": "_0"}) + url = None + try: + json_text = json_tag.string + json_meta = json.loads(json_text) + pdf_meta = json_meta['article']['pdfDownload']['urlMetadata'] + print(pdf_meta, file=sys.stderr) + # https://www.sciencedirect.com/science/article/pii/S0169204621000670/pdfft?md5=c4a83d06b334b627ded74cf9423bfa56&pid=1-s2.0-S0169204621000670-main.pdf + url = html_url + pdf_meta['pdfExtension'] + "?md5=" + pdf_meta['queryParams']['md5'] + "&pid=" + pdf_meta['queryParams']['pid'] + except Exception as e: + raise e + if url: + return dict(pdf_url=url, technique="sciencedirect-munge-json") + # sciencedirect PDF bounce page # https://www.sciencedirect.com/science/article/pii/S2590109519300424/pdfft?md5=854f43a44de186eb58674b8e20631691&pid=1-s2.0-S2590109519300424-main.pdf if '://www.sciencedirect.com/' in html_url and html_url.endswith(".pdf"): diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py index 1e58778..c805f0a 100644 --- a/python/sandcrawler/html_metadata.py +++ b/python/sandcrawler/html_metadata.py @@ -383,6 +383,60 @@ PDF_FULLTEXT_PATTERNS: List[dict] = [ "technique": "PDF URL link", "example_page": "https://www.degruyter.com/document/doi/10.1515/zaw-2021-0001/html", }, + { + "in_doc_url": "repositorio.unicamp.br/handle/", + "in_fulltext_url": "/bitstream/", + "selector": "table.panel-body a[target='_blank']", + "attr": "href", + "technique": "PDF URL link", + "example_page": "http://www.repositorio.unicamp.br/handle/REPOSIP/287750", + }, + { + "in_doc_url": "dlc.library.columbia.edu/durst/", + "selector": "dd.blacklight-lib_non_item_in_context_url_ssm a[href]", + "attr": "href", + "technique": "Access URL link", + "example_page": "https://dlc.library.columbia.edu/durst/cul:18931zcrk9", + }, + { + "in_doc_url": "fldeploc.dep.state.fl.us/geodb_query/fgs_doi", + "in_fulltext_url": "pdf", + "selector": "p a[href]", + "attr": "href", + "technique": "PDF URL link", + "example_page": "http://fldeploc.dep.state.fl.us/geodb_query/fgs_doi.asp?searchCode=IC29", + }, + { + "in_doc_url": "preprints.jmir.org/preprint/", + "selector": "a.pdf-download-button", + "attr": "href", + "technique": "PDF URL link", + "example_page": "https://preprints.jmir.org/preprint/22556", + }, + { + "in_doc_url": "bloomsburycollections.com/", + "in_fulltext_url": "pdf", + "selector": "li.download-item a[href]", + "attr": "href", + "technique": "PDF URL link", + "example_page": "https://www.bloomsburycollections.com/book/the-political-economies-of-media-the-transformation-of-the-global-media-industries/the-political-economies-of-media-and-the-transformation-of-the-global-media-industries", + }, + { + "in_doc_url": "emerald.com/insight/content/", + "in_fulltext_url": "pdf", + "selector": "a.intent_pdf_link", + "attr": "href", + "technique": "PDF URL link", + "example_page": "https://www.emerald.com/insight/content/doi/10.1108/RAMJ-11-2020-0065/full/html", + }, + { + "in_doc_url": "ingentaconnect.com/content/", + "in_fulltext_url": "pdf", + "selector": "a.pdf[data-popup]", + "attr": "data-popup", + "technique": "PDF URL link", + "example_page": "https://www.ingentaconnect.com/content/ista/sst/2021/00000049/00000001/art00007", + }, ] FULLTEXT_URL_PATTERNS_SKIP = [ diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index eb8e256..b610ab4 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -128,8 +128,9 @@ class IngestFileWorker(SandcrawlerWorker): self.wall_blocklist = [ # loginwall "://profile.thieme.de/HTML/sso/ejournals/login.htm", - "://login.bepress.com/" - "?SAMLRequest=" + "://login.bepress.com/", + "?SAMLRequest=", + "://osapublishing.org/captcha/", ] # these are special-case web domains for which we want SPN2 to not run -- cgit v1.2.3