html: handle embed with mangled 'src' attribute

author: Bryan Newbold <bnewbold@archive.org> 2020-08-24 12:02:27 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-08-24 12:02:27 -0700
commit: 90a623bd902e9abd9ce700be99d71ff7be9d00a1 (patch)
tree: 52a1b5061ecbe52fe455d769d6d21a9ce9e359fc
parent: 6af81aad4c780ec5892f135a5a8e231ac08fb1e0 (diff)
download: sandcrawler-90a623bd902e9abd9ce700be99d71ff7be9d00a1.tar.gz
sandcrawler-90a623bd902e9abd9ce700be99d71ff7be9d00a1.zip
1 file changed, 1 insertion, 1 deletion
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 6236a3b..acf1522 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -105,7 +105,7 @@ def extract_fulltext_url(html_url, html_body):
     # http://www.jasstudies.com/DergiTamDetay.aspx?ID=3401
     # <embed src="/files/jass_makaleler/1359848334_33-Okt.%20Yasemin%20KARADEM%C4%B0R.pdf" type="application/pdf" />
     embed = soup.find('embed', attrs={"type": "application/pdf"})
-    if embed:
+    if embed and embed.get('src'):
         url = embed['src'].strip()
         if url.startswith('/'):
             url = host_prefix+url
author	Bryan Newbold <bnewbold@archive.org>	2020-08-24 12:02:27 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2020-08-24 12:02:27 -0700
commit	90a623bd902e9abd9ce700be99d71ff7be9d00a1 (patch)
tree	52a1b5061ecbe52fe455d769d6d21a9ce9e359fc
parent	6af81aad4c780ec5892f135a5a8e231ac08fb1e0 (diff)
download	sandcrawler-90a623bd902e9abd9ce700be99d71ff7be9d00a1.tar.gz sandcrawler-90a623bd902e9abd9ce700be99d71ff7be9d00a1.zip