diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-10-22 21:34:40 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2019-10-22 21:35:00 -0700 |
commit | 2e3611f0e66615ae007d4e46bb5905e2220fb690 (patch) | |
tree | 9d2fa6f8d62145a5ab31f37f26b6c293a2163acd /python/sandcrawler/html.py | |
parent | b11fe8c8f444756ae246250cbbfe44e7dc62eac3 (diff) | |
download | sandcrawler-2e3611f0e66615ae007d4e46bb5905e2220fb690.tar.gz sandcrawler-2e3611f0e66615ae007d4e46bb5905e2220fb690.zip |
much progress on file ingest path
Diffstat (limited to 'python/sandcrawler/html.py')
-rw-r--r-- | python/sandcrawler/html.py | 73 |
1 files changed, 73 insertions, 0 deletions
import re
import sys
import urllib.parse

from bs4 import BeautifulSoup

# Matches the manuscript URL embedded as JSON in researchsquare.com landing
# pages. Dots are escaped (literal '.') and the file-id segment is capped at
# 50 chars so a malformed page can't produce a runaway match.
RESEARCHSQUARE_REGEX = re.compile(r'"url":"(https://assets\.researchsquare\.com/files/.{1,50}/v\d+/Manuscript\.pdf)"')

def extract_fulltext_url(html_url, html_body):
    """
    Takes an HTML document (and URL), assumed to be a landing page, and tries
    to find a fulltext PDF url.

    Args:
        html_url (str): URL the document was fetched from; used both for
            host-relative link resolution and for publisher-specific rules.
        html_body (bytes): raw HTML body (decoded to UTF-8 only where a
            regex over the raw text is needed).

    Returns:
        dict: empty if nothing was found; otherwise may contain 'pdf_url'
        (direct PDF link), 'next_url' (a redirect to follow), 'technique'
        (which heuristic matched), and/or 'release_stage'.
    """

    # e.g. "https://example.com" from "https://example.com/path/page"
    host_prefix = '/'.join(html_url.split('/')[:3])
    soup = BeautifulSoup(html_body, 'html.parser')

    ### General Tricks ###

    # highwire-style meta tag
    meta = soup.find('meta', attrs={"name":"citation_pdf_url"})
    if not meta:
        meta = soup.find('meta', attrs={"name":"bepress_citation_pdf_url"})
    # guard with .get(): a <meta> without a content attribute would otherwise
    # raise KeyError and crash the whole ingest
    if meta and meta.get('content'):
        url = meta['content'].strip()
        if url.startswith('http'):
            return dict(pdf_url=url, technique='citation_pdf_url')
        else:
            sys.stderr.write("malformed citation_pdf_url? {}\n".format(url))

    # ACS (and probably others) like:
    #   https://pubs.acs.org/doi/10.1021/acs.estlett.9b00379
    #   <a href="/doi/pdf/10.1021/acs.estlett.9b00379" title="PDF" target="_blank" class="button_primary"><i class="icon-file-pdf-o"></i><span>PDF (1 MB)</span></a>
    href = soup.find('a', attrs={"title":"PDF"})
    if href and href.get('href'):
        url = href['href'].strip()
        if url.startswith('http'):
            return dict(pdf_url=url, technique='href_title')
        elif url.startswith('/'):
            # host-relative link; prepend scheme+host from the landing URL
            return dict(pdf_url=host_prefix+url, technique='href_title')

    ### Publisher/Platform Specific ###

    # eLife (elifesciences.org)
    if '://elifesciences.org/articles/' in html_url:
        anchor = soup.find("a", attrs={"data-download-type": "pdf-article"})
        if anchor and anchor.get('href'):
            url = anchor['href'].strip()
            # sanity check: an eLife article-pdf link should point at a PDF
            assert '.pdf' in url
            return dict(pdf_url=url)

    # research square (researchsquare.com)
    if 'researchsquare.com/article/' in html_url:
        # JSON in body with a field like:
        #   "url":"https://assets.researchsquare.com/files/4a57970e-b002-4608-b507-b95967649483/v2/Manuscript.pdf"
        m = RESEARCHSQUARE_REGEX.search(html_body.decode('utf-8'))
        if m:
            url = m.group(1)
            # sanity check against pathological regex captures
            assert len(url) < 1024
            return dict(release_stage="manuscript", pdf_url=url)

    # elsevier linking hub: the landing page is just a meta-refresh-style
    # redirect; extract the sciencedirect.com target and let the caller
    # follow it (next_url, not pdf_url)
    if '://linkinghub.elsevier.com/retrieve/pii/' in html_url:
        redirect = soup.find("input", attrs={"name": "redirectURL"})
        if redirect and redirect.get('value'):
            url = redirect['value'].strip()
            if 'sciencedirect.com' in url:
                # target URL is percent-encoded in the input's value attribute
                url = urllib.parse.unquote(url)
                return dict(next_url=url)

    # no heuristic matched
    return dict()