From b91987978330b6da1c6bb5650418c2c6068340a1 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 25 Mar 2020 16:29:12 -0700 Subject: ingest: eurosurveillance PDF parser --- python/sandcrawler/html.py | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'python/sandcrawler') diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py index c76d7a2..04b3afe 100644 --- a/python/sandcrawler/html.py +++ b/python/sandcrawler/html.py @@ -310,4 +310,15 @@ def extract_fulltext_url(html_url, html_body): if url and url.startswith('http'): return dict(pdf_url=url, technique='figshare-json') + # eurosurveillance + # https://www.eurosurveillance.org/content/10.2807/1560-7917.ES.2020.25.11.2000230 + if "://www.eurosurveillance.org/content/" in html_url: + # + href = soup.find('a', attrs={"class":"pdf", "title": "Download"}) + if href: + url = href['href'].strip() + if not url.startswith('http'): + url = host_prefix + url + return dict(pdf_url=url, technique='eurosurveillance-href') + return dict() -- cgit v1.2.3