import re
import sys
import urllib.parse

from bs4 import BeautifulSoup

RESEARCHSQUARE_REGEX = re.compile(r'"url":"(https://assets.researchsquare.com/files/.{1,50}/v\d+/Manuscript.pdf)"')
IEEEXPLORE_REGEX = re.compile(r'"pdfPath":"(/.*?\.pdf)"')
OVID_JOURNAL_URL_REGEX = re.compile(r'journalURL = "(http.*)";')


def test_regex():
    lines = """
    blah
    var journalURL = "https://journals.lww.com/co-urology/fulltext/10.1097/MOU.0000000000000689";
    asdf"""
    m = OVID_JOURNAL_URL_REGEX.search(lines)
    assert m.group(1) == "https://journals.lww.com/co-urology/fulltext/10.1097/MOU.0000000000000689"
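
# A couple of extra spot-checks in the same style as test_regex() above. The
# JSON snippets are illustrative samples matching the formats described in the
# comments inside extract_fulltext_url(), not captured from live pages.
def test_more_regex():
    rs_blob = '"url":"https://assets.researchsquare.com/files/4a57970e-b002-4608-b507-b95967649483/v2/Manuscript.pdf"'
    m = RESEARCHSQUARE_REGEX.search(rs_blob)
    assert m.group(1).endswith("/v2/Manuscript.pdf")

    ieee_blob = '"pdfPath":"/iel7/6287639/8600701/08730316.pdf",'
    m = IEEEXPLORE_REGEX.search(ieee_blob)
    assert m.group(1) == "/iel7/6287639/8600701/08730316.pdf"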


def extract_fulltext_url(html_url, html_body):
    """
    Takes an HTML document (and its URL), assumed to be a landing page, and
    tries to find a fulltext PDF URL.
    """
    host_prefix = '/'.join(html_url.split('/')[:3])
    soup = BeautifulSoup(html_body, 'html.parser')

    ### General Tricks ###

    # highwire-style meta tag
    meta = soup.find('meta', attrs={"name": "citation_pdf_url"})
    if not meta:
        meta = soup.find('meta', attrs={"name": "bepress_citation_pdf_url"})
    # wiley has a weird almost-blank page we don't want to loop on
    if meta and "://onlinelibrary.wiley.com/doi/pdf/" not in html_url:
        url = meta['content'].strip()
        if url.startswith('/'):
            return dict(pdf_url=host_prefix + url, technique='citation_pdf_url')
        elif url.startswith('http'):
            return dict(pdf_url=url, technique='citation_pdf_url')
        else:
            print("malformed citation_pdf_url? {}".format(url), file=sys.stderr)

    # sage, and also utpjournals (see below)
    # https://journals.sagepub.com/doi/10.1177/2309499019888836
    # anchor like: <a href="..." class="show-pdf">
    href = soup.find('a', attrs={"class": "show-pdf"})
    if href:
        url = href['href'].strip()
        if url.startswith('http'):
            return dict(pdf_url=url, technique='href_show-pdf')

    # ACS (and probably others) like:
    # https://pubs.acs.org/doi/10.1021/acs.estlett.9b00379
    # anchor like: <a href="..." title="PDF">PDF (1 MB)</a>
    href = soup.find('a', attrs={"title": "PDF"})
    if href:
        url = href['href'].strip()
        if url.startswith('http'):
            return dict(pdf_url=url, technique='href_title')
        elif url.startswith('/'):
            return dict(pdf_url=host_prefix + url, technique='href_title')

    # http://www.jasstudies.com/DergiTamDetay.aspx?ID=3401
    # embed like: <embed src="..." type="application/pdf">
    embed = soup.find('embed', attrs={"type": "application/pdf"})
    if embed:
        url = embed['src'].strip()
        if url.startswith('/'):
            url = host_prefix + url
        if url.startswith('http'):
            return dict(pdf_url=url, technique='embed_type')

    ### Publisher/Platform Specific ###

    # eLife (elifesciences.org)
    if '://elifesciences.org/articles/' in html_url:
        anchor = soup.find("a", attrs={"data-download-type": "pdf-article"})
        if anchor:
            url = anchor['href'].strip()
            assert '.pdf' in url
            return dict(pdf_url=url, technique='publisher')

    # Research Square (researchsquare.com)
    if 'researchsquare.com/article/' in html_url:
        # JSON in body with a field like:
        # "url":"https://assets.researchsquare.com/files/4a57970e-b002-4608-b507-b95967649483/v2/Manuscript.pdf"
        m = RESEARCHSQUARE_REGEX.search(html_body.decode('utf-8'))
        if m:
            url = m.group(1)
            assert len(url) < 1024
            return dict(release_stage="manuscript", pdf_url=url, technique='publisher')

    # Elsevier linking hub
    # https://linkinghub.elsevier.com/retrieve/pii/S1569199319308975
    if '://linkinghub.elsevier.com/retrieve/pii/' in html_url:
        # hidden input like: <input name="redirectURL" value="...">
        redirect = soup.find("input", attrs={"name": "redirectURL"})
        if redirect:
            url = redirect['value'].strip()
            if 'http' in url:
                url = urllib.parse.unquote(url)
                # drop any tracking query parameters (eg, "?via=...")
                url = url.split('?via')[0]
                return dict(next_url=url, technique="elsevier-linkinghub")

    # ieeexplore.ieee.org
    # https://ieeexplore.ieee.org/document/8730316
    if '://ieeexplore.ieee.org/document/' in html_url:
        # JSON in body with a field like:
        # "pdfPath":"/iel7/6287639/8600701/08730316.pdf",
        m = IEEEXPLORE_REGEX.search(html_body.decode('utf-8'))
        if m:
            url = m.group(1)
            assert len(url) < 1024
            return dict(release_stage="published", pdf_url=host_prefix + url, technique="publisher")

    # https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=8730313
    if '://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber' in html_url:
        # HTML iframe like: <iframe src="...08730313.pdf...">
        iframe = soup.find("iframe")
        if iframe and '.pdf' in iframe['src']:
            return dict(pdf_url=iframe['src'], technique="iframe")

    # utpjournals.press
    # https://utpjournals.press/doi/10.3138/cjh.ach.54.1-2.05
    if '://utpjournals.press/doi/10.' in html_url:
        # anchor like: <a href="..." class="show-pdf">
        href = soup.find('a', attrs={"class": "show-pdf"})
        if href:
            url = href['href'].strip()
            if url.startswith('http'):
                return dict(pdf_url=url, technique='publisher-href')

    # https://www.jcancer.org/v10p4038.htm
    # simple journal-specific href
    if '://www.jcancer.org/' in html_url and html_url.endswith(".htm"):
        # anchor like: <a href="v10p4038.pdf" class="textbutton">PDF</a>
        href = soup.find('a', attrs={"class": "textbutton"})
        if href:
            url = href['href'].strip()
            if url.endswith(".pdf") and "http" not in url:
                return dict(pdf_url=host_prefix + "/" + url, technique='journal-href')

    # https://insights.ovid.com/crossref?an=00042307-202001000-00013
    # Ovid is some kind of landing page bounce portal tracking run-around.
    # Can extract the actual journal URL from a javascript blob in the HTML.
    if '://insights.ovid.com/crossref' in html_url:
        # var journalURL = "https://journals.lww.com/co-urology/fulltext/10.1097/MOU.0000000000000689";
        m = OVID_JOURNAL_URL_REGEX.search(html_body.decode('utf-8'))
        if m:
            url = m.group(1)
            assert len(url) < 1024
            return dict(next_url=url, technique='ovid')

    # osf.io
    # https://osf.io/8phvx/
    # https://osf.io/preprints/socarxiv/8phvx/
    # wow, they ship total javascript crud! going to just guess the download
    # URL based on the page URL for now. Maybe the content-type header would help?
    if '://osf.io/' in html_url and '/download' not in html_url:
        if not html_url.endswith("/"):
            next_url = html_url + "/download"
        else:
            next_url = html_url + "download"
        return dict(next_url=next_url, technique='osf-by-url')

    # wiley
    # https://onlinelibrary.wiley.com/doi/pdf/10.1111/1467-923X.12787
    if "://onlinelibrary.wiley.com/doi/pdf/" in html_url:
        # html_body is raw bytes, so compare against a bytes literal
        if b"/doi/pdfdirect/" in html_body:
            next_url = html_url.replace('/doi/pdf/', '/doi/pdfdirect/')
            return dict(next_url=next_url, technique='wiley-pdfdirect')

    # Taylor & Francis
    # https://www.tandfonline.com/doi/full/10.1080/19491247.2019.1682234
    # anchor like: <a href="/doi/pdf/..." class="show-pdf">
    if "://www.tandfonline.com/doi/full/10." in html_url:
        href = soup.find('a', attrs={"class": "show-pdf"})
        if href:
            url = href['href'].strip()
            if "/pdf/" in url:
                return dict(pdf_url=host_prefix + url, technique='publisher')

    return dict()
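
# Minimal usage sketch (assumption: `requests` is installed; it is not a
# dependency of this module). Fetches a landing page and runs extraction:
if __name__ == "__main__":
    import requests

    resp = requests.get(sys.argv[1])
    # html_body is expected to be raw bytes (the .decode() calls above rely on
    # this), so pass resp.content rather than resp.text
    print(extract_fulltext_url(resp.url, resp.content))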