about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-02-22 17:44:53 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-02-22 17:44:53 -0800
commit: e608c22854c8796619e8d6cac1264a3e936eb9e9 (patch)
tree: 6c2bff55502c1d54aed95f6bd55171113db5e355
parent: fbfcb3cc2215613d972e589eaad519ea726b5d31 (diff)
download: sandcrawler-e608c22854c8796619e8d6cac1264a3e936eb9e9.tar.gz
download: sandcrawler-e608c22854c8796619e8d6cac1264a3e936eb9e9.zip
html: more publisher-specific fulltext extraction tricks
-rw-r--r-- python/sandcrawler/html.py 47
1 files changed, 47 insertions, 0 deletions
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 2b173b0..091162d 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -1,6 +1,7 @@
import re
import sys
+import json
import urllib.parse
from bs4 import BeautifulSoup
@@ -258,4 +259,50 @@ def extract_fulltext_url(html_url, html_body):
if url.startswith('http') and 'pdfs.journals.lww.com' in url:
return dict(pdf_url=url, technique='journals.lww.com-jsvar')
+ # www.ahajournals.org
+ # https://www.ahajournals.org/doi/10.1161/circ.110.19.2977
+ if "://www.ahajournals.org/doi/" in html_url and not '/doi/pdf/' in html_url:
+ # <a href="/doi/pdf/10.1161/circ.110.19.2977?download=true">PDF download</a>
+ if b'/doi/pdf/10.' in html_body:
+ url = html_url.replace('/doi/10.', '/doi/pdf/10.')
+ url = url + "?download=true"
+ return dict(pdf_url=url, technique='ahajournals-url')
+
+ # ehp.niehs.nih.gov
+ # https://ehp.niehs.nih.gov/doi/full/10.1289/EHP4709
+ if "://ehp.niehs.nih.gov/doi/full/" in html_url:
+ # <a href="/doi/pdf/10.1289/EHP4709" target="_blank">
+ if b'/doi/pdf/10.' in html_body:
+ url = html_url.replace('/doi/full/10.', '/doi/pdf/10.')
+ return dict(pdf_url=url, technique='ehp.niehs.nigh.gov-url')
+
+ # journals.tsu.ru (and maybe others)
+ # http://journals.tsu.ru/psychology/&journal_page=archive&id=1815&article_id=40405
+ # <a class='file pdf' href='http://journals.tsu.ru/engine/download.php?id=150921&area=files'>Скачать электронную версию публикации</a>
+ href = soup.find('a', attrs={"class":"file pdf"})
+ if href:
+ url = href['href'].strip()
+ if url.startswith('http'):
+ return dict(pdf_url=url, technique='href_file_pdf-pdf')
+
+ # cogentoa.com
+ # https://www.cogentoa.com/article/10.1080/23311975.2017.1412873
+ if "://www.cogentoa.com/article/" in html_url and not ".pdf" in html_url:
+ # blech, it's a SPA! All JS
+ # https://www.cogentoa.com/article/10.1080/23311975.2017.1412873.pdf
+ url = html_url + ".pdf"
+ return dict(pdf_url=url, technique='cogentoa-url')
+
+ # chemrxiv.org (likely to be other figshare domains also)
+ # https://chemrxiv.org/articles/Biradical_Formation_by_Deprotonation_in_Thiazole-Derivatives_The_Hidden_Nature_of_Dasatinib/10101419
+ if "://chemrxiv.org/articles/" in html_url or '.figshare.org/articles/' in html_url:
+ # <script id="app-data" type="text/json"> [...] </script>
+ json_tag = soup.find('script', id="app-data", attrs={"type": "text/json"})
+ if json_tag.string:
+ app_data = json.loads(json_tag.string)
+ # "exportPdfDownloadUrl": "https://s3-eu-west-1.amazonaws.com/itempdf74155353254prod/10101419/Biradical_Formation_by_Deprotonation_in_Thiazole-Derivatives__The_Hidden_Nature_of_Dasatinib_v1.pdf"
+ url = app_data.get('article', {}).get('exportPdfDownloadUrl')
+ if url and url.startswith('http'):
+ return dict(pdf_url=url, technique='figshare-json')
+
return dict()