From d86a87f5000b97a2dc93c4a60ba4a18e834c9e0f Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 15 Sep 2020 20:26:24 -0700 Subject: html: handle JMIR URL pattern --- python/sandcrawler/html.py | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'python') diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py index a5cbaf5..70761a3 100644 --- a/python/sandcrawler/html.py +++ b/python/sandcrawler/html.py @@ -395,6 +395,12 @@ def extract_fulltext_url(html_url, html_body): if url.encode('utf-8') in html_body: return dict(pdf_url=url, technique='href-eperiodica') + # JMIR + # https://mhealth.jmir.org/2020/7/e17891/ + if '.jmir.org/' in html_url and not "/pdf" in html_url and html_url.endswith("/"): + url = html_url + "pdf" + return dict(pdf_url=url, technique='jmir-url') + ### below here we are doing guesses # generic guess: try current URL plus .pdf, if it exists in the HTML body -- cgit v1.2.3