From 90a623bd902e9abd9ce700be99d71ff7be9d00a1 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Mon, 24 Aug 2020 12:02:27 -0700
Subject: html: handle embed with mangled 'src' attribute

---
 python/sandcrawler/html.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 6236a3b..acf1522 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -105,7 +105,7 @@ def extract_fulltext_url(html_url, html_body):
     # http://www.jasstudies.com/DergiTamDetay.aspx?ID=3401
     #
     embed = soup.find('embed', attrs={"type": "application/pdf"})
-    if embed:
+    if embed and embed.get('src'):
         url = embed['src'].strip()
         if url.startswith('/'):
             url = host_prefix+url
-- 
cgit v1.2.3