From a61dd0c429b9e6d24987e14cd5d66057adb498da Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Sun, 3 May 2020 19:21:38 -0700 Subject: ingest: handle partial citation_pdf_url tag Eg: https://www.cureus.com/articles/29935-a-nomogram-for-the-rapid-prediction-of-hematocrit-following-blood-loss-and-fluid-shifts-in-neonates-infants-and-adults Has: a citation_pdf_url meta tag whose content attribute is missing or empty (the example markup was lost in extraction) --- python/sandcrawler/html.py | 3 +++ 1 file changed, 3 insertions(+) (limited to 'python') diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py index 8fbb0ba..6e346e7 100644 --- a/python/sandcrawler/html.py +++ b/python/sandcrawler/html.py @@ -54,6 +54,9 @@ def extract_fulltext_url(html_url, html_body): if not meta: # researchgate does this; maybe others also? meta = soup.find('meta', attrs={"property":"citation_pdf_url"}) + # if tag is only partially populated + if not meta.get('content'): + meta = None # wiley has a weird almost-blank page we don't want to loop on if meta and not "://onlinelibrary.wiley.com/doi/pdf/" in html_url: url = meta['content'].strip() -- cgit v1.2.3