import json
import re
import sys
import urllib.parse
from typing import Any, Dict, Optional

from bs4 import BeautifulSoup

# NOTE(review): this module was restored from a copy in which URL-like text
# (host names, example URLs, HTML snippets in comments) had been stripped and
# line breaks collapsed. Every host/path match literal and every dotted
# `technique` label below is a reconstruction -- confirm each one against the
# canonical version of this file before merging.

RESEARCHSQUARE_REGEX = re.compile(
    r'"url":"(https://assets.researchsquare.com/files/.{1,50}/v\d+/Manuscript.pdf)"'
)
IEEEXPLORE_REGEX = re.compile(r'"pdfPath":"(/.*?\.pdf)"')
OVID_JOURNAL_URL_REGEX = re.compile(r'journalURL = "(http.*)";')
SCIENCEDIRECT_BOUNCE_URL_REGEX = re.compile(r"window.location = '(http.*)';")


def _first_group(
    pattern: "re.Pattern[str]", html_body: bytes, max_len: int = 4096
) -> Optional[str]:
    """
    Return capture group 1 of the first `pattern` match in `html_body`.

    The body is decoded as UTF-8 with undecodable bytes replaced, so a page in
    another encoding cannot raise. Returns None when there is no match, or when
    the captured text is `max_len` characters or longer (treated as garbage
    rather than a usable URL).
    """
    match = pattern.search(html_body.decode("utf-8", errors="replace"))
    if not match:
        return None
    found = match.group(1)
    if len(found) >= max_len:
        print(f"\tskipping over-long extracted URL ({len(found)} chars)", file=sys.stderr)
        return None
    return found


def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]:
    """
    Takes an HTML document (and URL), assumed to be a landing page, and tries
    to find a fulltext PDF url.

    Heuristics are tried in a fixed order: generic meta tags first, then
    publisher/platform specific rules keyed on substrings of `html_url`, then
    a generic "current URL plus .pdf" guess. The first rule that produces a
    URL wins.

    Returns a dict with a "technique" key naming the rule that fired, plus
    either "pdf_url" (a direct PDF link) or "next_url" (another page to fetch
    and try again). Some rules also set "release_stage".

    On error, or if fails to extract a URL, returns an empty dict.
    """

    # scheme + host, eg "https://example.org"; used to absolutize "/path" links
    host_prefix = "/".join(html_url.split("/")[:3])
    try:
        soup = BeautifulSoup(html_body, "html.parser")
    except TypeError as te:
        print(f"{te} (url={html_url})", file=sys.stderr)
        return dict()
    except UnboundLocalError as ule:
        print(f"{ule} (url={html_url})", file=sys.stderr)
        return dict()

    # ignoring most type checks on bs4 output in this function (which is partially deprecated)
    meta: Any
    url: Any
    redirect: Any

    ### General Tricks ###

    # highwire-style meta tag
    meta = soup.find("meta", attrs={"name": "citation_pdf_url"})
    if not meta:
        meta = soup.find("meta", attrs={"name": "bepress_citation_pdf_url"})
    if not meta:
        meta = soup.find("meta", attrs={"name": "wkhealth_pdf_url"})
    if not meta:
        # researchgate does this; maybe others also?
        meta = soup.find("meta", attrs={"property": "citation_pdf_url"})
    if not meta:
        meta = soup.find("meta", attrs={"name": "eprints.document_url"})
    # if tag is only partially populated
    if meta and not meta.get("content"):
        meta = None
    # wiley has a weird almost-blank page we don't want to loop on
    if meta and "://onlinelibrary.wiley.com/doi/pdf/" not in html_url:
        url = meta["content"].strip()
        if "://doi.org/" in url:
            print(f"\tdoi.org in citation_pdf_url (loop?): {url}", file=sys.stderr)
        elif url.startswith("/"):
            if host_prefix + url == html_url:
                print("\tavoiding citation_pdf_url link-loop", file=sys.stderr)
            else:
                return dict(pdf_url=host_prefix + url, technique="citation_pdf_url")
        elif url.startswith("http"):
            if url == html_url:
                print("\tavoiding citation_pdf_url link-loop", file=sys.stderr)
            else:
                return dict(pdf_url=url, technique="citation_pdf_url")
        else:
            print("\tmalformed citation_pdf_url? {}".format(url), file=sys.stderr)

    meta = soup.find("meta", attrs={"name": "generator"})
    meta_generator = None
    if meta and meta.get("content"):
        meta_generator = meta["content"].strip()

    ### Publisher/Platform Specific ###

    # research square
    if "researchsquare.com/article/" in html_url:
        # JSON in body with a "url" field pointing at ".../vN/Manuscript.pdf"
        url = _first_group(RESEARCHSQUARE_REGEX, html_body)
        if url:
            return dict(release_stage="manuscript", pdf_url=url, technique="publisher")

    # elsevier linking hub
    if "://linkinghub.elsevier.com/retrieve/pii/" in html_url:
        # hidden <input name="redirectURL"> carrying a percent-encoded target
        redirect = soup.find("input", attrs={"name": "redirectURL"})
        if redirect and redirect.get("value"):
            url = redirect["value"].strip()
            if "http" in url:
                url = urllib.parse.unquote(url)
                # drop the "?via..." query parameter (and anything after it)
                url = url.split("?via")[0]
                return dict(next_url=url, technique="elsevier-linkinghub")

    # sciencedirect PDF URL extract
    if "sciencedirect.com/science/article/pii/" in html_url and not html_url.endswith(".pdf"):
        json_tag: Any = soup.find(
            "script", attrs={"type": "application/json", "data-iso-key": "_0"}
        )
        url = None
        if json_tag:
            try:
                json_text = json_tag.string
                json_meta = json.loads(json_text)
                pdf_meta = json_meta["article"]["pdfDownload"]["urlMetadata"]
                # landing URL + extension + "?md5=...&pid=..."
                url = (
                    html_url
                    + pdf_meta["pdfExtension"]
                    + "?md5="
                    + pdf_meta["queryParams"]["md5"]
                    + "&pid="
                    + pdf_meta["queryParams"]["pid"]
                )
            except (KeyError, TypeError, json.JSONDecodeError):
                # best-effort: missing/odd JSON just means this rule doesn't apply
                pass
        if url:
            return dict(pdf_url=url, technique="sciencedirect-munge-json")

    # sciencedirect PDF bounce page
    if "://www.sciencedirect.com/" in html_url and html_url.endswith(".pdf"):
        # page body contains: window.location = '<signed PDF URL>';
        url = _first_group(SCIENCEDIRECT_BOUNCE_URL_REGEX, html_body, max_len=4000)
        if url:
            return dict(pdf_url=url, technique="sciencedirect-bounce")

    # ieeexplore document landing page
    if "://ieeexplore.ieee.org/document/" in html_url:
        # JSON in body with a field like:
        # "pdfPath":"/iel7/6287639/8600701/08730316.pdf",
        url = _first_group(IEEEXPLORE_REGEX, html_body)
        if url:
            return dict(
                release_stage="published", pdf_url=host_prefix + url, technique="ieeexplore"
            )
    # ieeexplore "stamp" viewer page
    if "://ieeexplore.ieee.org/stamp/stamp.jsp" in html_url:
        # the PDF is embedded via an <iframe src="...pdf...">
        iframe: Any = soup.find("iframe")
        if iframe and ".pdf" in (iframe.get("src") or ""):
            return dict(pdf_url=iframe["src"], technique="iframe")

    # Ovid is some kind of landing page bounce portal tracking run-around.
    # Can extract actual journal URL from javascript blob in the HTML
    if "://insights.ovid.com/crossref" in html_url:
        # var journalURL = "<journal landing page URL>";
        url = _first_group(OVID_JOURNAL_URL_REGEX, html_body)
        if url:
            return dict(next_url=url, technique="ovid")

    # OSF-hosted preprint servers
    # wow, they ship total javascript crud! going to just guess download URL
    # based on URL for now. Maybe content type header would help?
    OSF_DOMAINS = [
        "://osf.io/",
        "://biohackrxiv.org/",
        "://psyarxiv.com/",
        "://arabixiv.org/",
        "://engrxiv.org/",
        "://edarxiv.org/",
        "://ecsarxiv.org/",
        "://ecoevorxiv.org/",
        "://frenxiv.org/",
        "://indiarxiv.org/",
        "://mindrxiv.org/",
        "://mediarxiv.org/",
        "://paleorxiv.org/",
        "://thesiscommons.org/",
    ]
    for domain in OSF_DOMAINS:
        if (
            domain in html_url
            and (len(html_url.split("/")) in [4, 5] or "/preprints/" in html_url)
            and "/download" not in html_url
        ):
            if not html_url.endswith("/"):
                next_url = html_url + "/download"
            else:
                next_url = html_url + "download"
            return dict(next_url=next_url, technique="osf-by-url")

    # wiley
    if "://onlinelibrary.wiley.com/doi/pdf/" in html_url:
        if b"/doi/pdfdirect/" in html_body:
            next_url = html_url.replace("/doi/pdf/", "/doi/pdfdirect/")
            return dict(next_url=next_url, technique="wiley-pdfdirect")

    # arxiv abstract pages
    if "://arxiv.org/abs/" in html_url:
        url = html_url.replace("/abs/", "/pdf/")
        return dict(pdf_url=url, technique="arxiv-url")

    # american archivist (OA)
    if "://americanarchivist.org/doi/" in html_url and "/doi/pdf" not in html_url:
        # use a more aggressive direct guess to avoid rate-limiting...
        if "/doi/10." in html_url:
            url = html_url.replace("/doi/10.", "/doi/pdf/10.")
            return dict(pdf_url=url, technique="archivist-url")
        # otherwise look for an <a target="_blank" href="/doi/pdf/..."> link
        hrefs = soup.find_all("a", attrs={"target": "_blank"})
        for href in hrefs:
            # anchors without an href attribute are simply skipped
            url = (href.get("href") or "").strip()
            if "/doi/pdf/" in url:
                if url.startswith("http"):
                    return dict(pdf_url=url, technique="publisher-href")
                elif url.startswith("/"):
                    return dict(pdf_url=host_prefix + url, technique="publisher-href")

    # protocols.io
    if "://www.protocols.io/view/" in html_url and not html_url.endswith(".pdf"):
        url = html_url + ".pdf"
        return dict(pdf_url=url, technique="protocolsio-url")

    # degruyter
    if "://www.degruyter.com/view/" in html_url and html_url.endswith(".xml"):
        url = html_url.replace("/view/", "/downloadpdf/").replace(".xml", ".pdf")
        return dict(pdf_url=url, technique="degruyter-url")

    # journals.lww.com (Wolters Kluwer)
    # DISABLED: they seem to redirect our crawler back to a "Fulltext" page and
    # we never get the content.
    if "://journals.lww.com/" in html_url and False:
        # looks for a line carrying: data-pdf-url="<tokenized PDF URL>"
        for raw_line in html_body.split(b"\n"):
            if b"data-pdf-url=" in raw_line:
                text_line = raw_line.decode("utf-8", errors="replace")
                url = text_line.strip().replace("data-pdf-url=", "").replace('"', "")
                if url.startswith("http") and "pdfs.journals.lww.com" in url:
                    return dict(pdf_url=url, technique="journals.lww.com")

    # ahajournals
    if "://www.ahajournals.org/doi/" in html_url and "/doi/pdf/" not in html_url:
        # page carries a "PDF download" link of the form /doi/pdf/10.xxx?download=true
        if b"/doi/pdf/10." in html_body:
            url = html_url.replace("/doi/10.", "/doi/pdf/10.")
            url = url + "?download=true"
            return dict(pdf_url=url, technique="ahajournals-url")

    # ehp.niehs.nih.gov (landing pages under both /doi/full/10... and /doi/10...)
    if "://ehp.niehs.nih.gov/doi/" in html_url:
        if b"/doi/pdf/10." in html_body:
            url = html_url.replace("/doi/full/10.", "/doi/pdf/10.").replace(
                "/doi/10.", "/doi/pdf/10."
            )
            return dict(pdf_url=url, technique="ehp.niehs.nih.gov-url")

    # cogentoa
    if "://www.cogentoa.com/article/" in html_url and ".pdf" not in html_url:
        # blech, it's a SPA! All JS
        url = html_url + ".pdf"
        return dict(pdf_url=url, technique="cogentoa-url")

    # chemrxiv (likely to be other figshare domains also)
    if "://chemrxiv.org/articles/" in html_url or ".figshare.org/articles/" in html_url:
        # <script id="app-data" type="text/json"> holds the page state as JSON
        json_tag = soup.find("script", id="app-data", attrs={"type": "text/json"})
        if json_tag and json_tag.string:
            try:
                app_data = json.loads(json_tag.string)
                url = app_data.get("article", {}).get("exportPdfDownloadUrl")
            except (json.JSONDecodeError, AttributeError):
                # malformed or unexpectedly-shaped JSON: rule doesn't apply
                url = None
            if isinstance(url, str) and url.startswith("http"):
                return dict(pdf_url=url, technique="figshare-json")

    # CNKI COVID-19 landing pages
    if "://en.gzbd.cnki.net/KCMS/detail/detail.aspx" in html_url:
        # "PDF Download" anchor with id="pdfDown"
        href = soup.find("a", attrs={"id": "pdfDown"})
        if href and href.get("href"):
            url = href["href"].strip().replace(" ", "")
            if not url.startswith("http"):
                url = host_prefix + url
            return dict(pdf_url=url, technique="cnki-href")

    # RWTH AACHEN repository
    if "://publications.rwth-aachen.de/record/" in html_url:
        record_id = html_url.split("/")[-1]
        url = f"{html_url}/files/{record_id}.pdf"
        # only trust the guess if the page itself links to it
        if record_id.isdigit() and url.encode("utf-8") in html_body:
            return dict(pdf_url=url, technique="rwth-aachen-url")

    # physchemaspects
    if "://physchemaspects.ru/" in html_url and soup:
        for href in soup.find_all("a"):
            if href.text == "download PDF file" and href.get("href"):
                url = href["href"]
                if url.startswith("/"):
                    url = host_prefix + url
                return dict(pdf_url=url, technique="physchemaspects-href")

    # OJS 3 (some)
    if meta_generator and meta_generator.startswith("Open Journal Systems"):
        href = soup.find("a", attrs={"class": "obj_galley_link file"})
        if href and href.get("href") and href.text and "pdf" in href.text.lower():
            url = href["href"].strip()
            if url.startswith("/"):
                url = host_prefix + url
            return dict(pdf_url=url, technique="ojs-galley-href")

    # ETH zurich e-periodica
    if "://www.e-periodica.ch/digbib/view" in html_url:
        url = html_url.replace("digbib/view", "cntmng").split("#")[0]
        if url.encode("utf-8") in html_body:
            return dict(pdf_url=url, technique="href-eperiodica")

    # JMIR
    if ".jmir.org/" in html_url and "/pdf" not in html_url and html_url.endswith("/"):
        url = html_url + "pdf"
        return dict(pdf_url=url, technique="jmir-url")

    # Google Drive
    # this is assuming it is a PDF
    if "drive.google.com/file/d/" in html_url and "/view" in html_url:
        url_parts = html_url.split("/")
        # for "scheme://host/file/d/<id>/view" the file id is path segment 5
        if len(url_parts) > 5 and len(url_parts[5]) > 10:
            gdrive_id = url_parts[5]
            return dict(
                pdf_url=f"https://drive.google.com/uc?export=download&id={gdrive_id}",
                technique="google-drive",
            )

    ### below here we are doing guesses

    # generic guess: try current URL plus .pdf, if it exists in the HTML body
    if ".pdf" not in html_url:
        url = html_url + ".pdf"
        if url.encode("utf-8") in html_body:
            return dict(pdf_url=url, technique="guess-url-plus-pdf")

    return dict()


def test_regex() -> None:
    """Check the Ovid and ScienceDirect bounce regexes against sample snippets."""
    journal_url = "https://journals.example.org/co-urology/fulltext/10.1000/EXAMPLE.0001"
    lines = f"""
    blah
    var journalURL = "{journal_url}";
    asdf"""
    m = OVID_JOURNAL_URL_REGEX.search(lines)
    assert m
    assert m.group(1) == journal_url

    url = (
        "https://pdf.example.org/320270/AIP/1-s2.0-EXAMPLE/main.pdf"
        "?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Expires=300&type=client"
    )
    lines = f"""
            window.onload = function () {{
                window.location = '{url}';
                refreshOriginalWindow();
            }}
    """
    m = SCIENCEDIRECT_BOUNCE_URL_REGEX.search(lines)
    assert m
    assert m.group(1) == url