From 0897e3a5714ff8549e8ab68f44f0c49ce3f0405d Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 13 Nov 2019 16:03:02 -0800 Subject: more progress on file ingest --- python/sandcrawler/html.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'python/sandcrawler/html.py') diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py index 3191b66..858e02a 100644 --- a/python/sandcrawler/html.py +++ b/python/sandcrawler/html.py @@ -6,6 +6,7 @@ import urllib.parse from bs4 import BeautifulSoup RESEARCHSQUARE_REGEX = re.compile(r'"url":"(https://assets.researchsquare.com/files/.{1,50}/v\d+/Manuscript.pdf)"') +IEEEXPLORE_REGEX = re.compile(r'"pdfPath":"(/.*?\.pdf)"') def extract_fulltext_url(html_url, html_body): """ @@ -70,4 +71,22 @@ def extract_fulltext_url(html_url, html_body): url = urllib.parse.unquote(url) return dict(next_url=url) + # ieeexplore.ieee.org + # https://ieeexplore.ieee.org/document/8730316 + if '://ieeexplore.ieee.org/document/' in html_url: + # JSON in body with a field like: + # "pdfPath":"/iel7/6287639/8600701/08730316.pdf", + m = IEEEXPLORE_REGEX.search(html_body.decode('utf-8')) + if m: + url = m.group(1) + assert len(url) < 1024 + return dict(release_stage="published", pdf_url=host_prefix+url) + # https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=8730313 + if '://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber' in html_url: + # HTML iframe like: + # + iframe = soup.find("iframe") + if iframe and '.pdf' in iframe['src']: + return dict(pdf_url=iframe['src']) + return dict() -- cgit v1.2.3