From e608c22854c8796619e8d6cac1264a3e936eb9e9 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Sat, 22 Feb 2020 17:44:53 -0800 Subject: html: more publisher-specific fulltext extraction tricks --- python/sandcrawler/html.py | 47 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) (limited to 'python') diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py index 2b173b0..091162d 100644 --- a/python/sandcrawler/html.py +++ b/python/sandcrawler/html.py @@ -1,6 +1,7 @@ import re import sys +import json import urllib.parse from bs4 import BeautifulSoup @@ -258,4 +259,50 @@ def extract_fulltext_url(html_url, html_body): if url.startswith('http') and 'pdfs.journals.lww.com' in url: return dict(pdf_url=url, technique='journals.lww.com-jsvar') + # www.ahajournals.org + # https://www.ahajournals.org/doi/10.1161/circ.110.19.2977 + if "://www.ahajournals.org/doi/" in html_url and not '/doi/pdf/' in html_url: + # PDF download + if b'/doi/pdf/10.' in html_body: + url = html_url.replace('/doi/10.', '/doi/pdf/10.') + url = url + "?download=true" + return dict(pdf_url=url, technique='ahajournals-url') + + # ehp.niehs.nih.gov + # https://ehp.niehs.nih.gov/doi/full/10.1289/EHP4709 + if "://ehp.niehs.nih.gov/doi/full/" in html_url: + # + if b'/doi/pdf/10.' in html_body: + url = html_url.replace('/doi/full/10.', '/doi/pdf/10.') + return dict(pdf_url=url, technique='ehp.niehs.nigh.gov-url') + + # journals.tsu.ru (and maybe others) + # http://journals.tsu.ru/psychology/&journal_page=archive&id=1815&article_id=40405 + # Скачать электронную версию публикации + href = soup.find('a', attrs={"class":"file pdf"}) + if href: + url = href['href'].strip() + if url.startswith('http'): + return dict(pdf_url=url, technique='href_file_pdf-pdf') + + # cogentoa.com + # https://www.cogentoa.com/article/10.1080/23311975.2017.1412873 + if "://www.cogentoa.com/article/" in html_url and not ".pdf" in html_url: + # blech, it's a SPA! All JS + # https://www.cogentoa.com/article/10.1080/23311975.2017.1412873.pdf + url = html_url + ".pdf" + return dict(pdf_url=url, technique='cogentoa-url') + + # chemrxiv.org (likely to be other figshare domains also) + # https://chemrxiv.org/articles/Biradical_Formation_by_Deprotonation_in_Thiazole-Derivatives_The_Hidden_Nature_of_Dasatinib/10101419 + if "://chemrxiv.org/articles/" in html_url or '.figshare.org/articles/' in html_url: + # + json_tag = soup.find('script', id="app-data", attrs={"type": "text/json"}) + if json_tag.string: + app_data = json.loads(json_tag.string) + # "exportPdfDownloadUrl": "https://s3-eu-west-1.amazonaws.com/itempdf74155353254prod/10101419/Biradical_Formation_by_Deprotonation_in_Thiazole-Derivatives__The_Hidden_Nature_of_Dasatinib_v1.pdf" + url = app_data.get('article', {}).get('exportPdfDownloadUrl') + if url and url.startswith('http'): + return dict(pdf_url=url, technique='figshare-json') + return dict() -- cgit v1.2.3