diff options
| -rw-r--r-- | python/sandcrawler/html.py | 3 | 
1 files changed, 3 insertions, 0 deletions
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 8fbb0ba..6e346e7 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -54,6 +54,9 @@ def extract_fulltext_url(html_url, html_body):
     if not meta:
         # researchgate does this; maybe others also?
         meta = soup.find('meta', attrs={"property":"citation_pdf_url"})
+    # if tag is only partially populated
+    if not meta.get('content'):
+        meta = None
     # wiley has a weird almost-blank page we don't want to loop on
     if meta and not "://onlinelibrary.wiley.com/doi/pdf/" in html_url:
         url = meta['content'].strip()
