From e6f2a585868b0277145659b9d653a0288f76f5b6 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 18 Feb 2020 23:08:06 -0800 Subject: allow <meta property=citation_pdf_url> at least researchgate does this (!) --- python/sandcrawler/html.py | 3 +++ 1 file changed, 3 insertions(+) (limited to 'python') diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py index e6f0f69..8e9eb1f 100644 --- a/python/sandcrawler/html.py +++ b/python/sandcrawler/html.py @@ -44,6 +44,9 @@ def extract_fulltext_url(html_url, html_body): meta = soup.find('meta', attrs={"name":"citation_pdf_url"}) if not meta: meta = soup.find('meta', attrs={"name":"bepress_citation_pdf_url"}) + if not meta: + # researchgate does this; maybe others also? + meta = soup.find('meta', attrs={"property":"citation_pdf_url"}) # wiley has a weird almost-blank page we don't want to loop on if meta and not "://onlinelibrary.wiley.com/doi/pdf/" in html_url: url = meta['content'].strip() -- cgit v1.2.3