From 4bd6d443457bed67039c8d8a56a184c1f70247b6 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 14 Sep 2020 14:14:22 -0700 Subject: html parse: add another generic fulltext pattern --- python/sandcrawler/html.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'python') diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py index acf1522..75e111e 100644 --- a/python/sandcrawler/html.py +++ b/python/sandcrawler/html.py @@ -95,13 +95,22 @@ def extract_fulltext_url(html_url, html_body): # https://pubs.acs.org/doi/10.1021/acs.estlett.9b00379 # PDF (1 MB) href = soup.find('a', attrs={"title":"PDF"}) - if href and 'href' in href: + if href and href.get('href'): url = href['href'].strip() if url.startswith('http'): return dict(pdf_url=url, technique='href_title') elif url.startswith('/'): return dict(pdf_url=host_prefix+url, technique='href_title') + # http://www.revistas.unam.mx/index.php/rep/article/view/35503/32336 + href = soup.find('a', attrs={"id":"pdfDownloadLink"}) + if href and href.get('href'): + url = href['href'].strip() + if url.startswith('http'): + return dict(pdf_url=url, technique='href_pdfDownloadLink') + elif url.startswith('/'): + return dict(pdf_url=host_prefix+url, technique='href_pdfDownloadLink') + # http://www.jasstudies.com/DergiTamDetay.aspx?ID=3401 # embed = soup.find('embed', attrs={"type": "application/pdf"}) -- cgit v1.2.3