From 748678bc88ea31a362ec5e896fd991b3c8dcbe58 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Sun, 3 May 2020 19:38:19 -0700
Subject: hotfix for html meta extract codepath

Didn't test last commit before pushing; bad Bryan!
---
 python/sandcrawler/html.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 6e346e7..3eadc7b 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -55,7 +55,7 @@ def extract_fulltext_url(html_url, html_body):
         # researchgate does this; maybe others also?
         meta = soup.find('meta', attrs={"property":"citation_pdf_url"})
     # if tag is only partially populated
-    if not meta.get('content'):
+    if meta and not meta.get('content'):
         meta = None
     # wiley has a weird almost-blank page we don't want to loop on
     if meta and not "://onlinelibrary.wiley.com/doi/pdf/" in html_url:
-- 
cgit v1.2.3