aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-05-03 19:21:38 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-05-03 19:21:40 -0700
commit: a61dd0c429b9e6d24987e14cd5d66057adb498da (patch)
tree: 6d4189ff95cdfb905c30c0791eb3b19e339627af
parent: 2d85d55ed1af4220f1469f4cf0cddd7e0888b3b3 (diff)
download: sandcrawler-a61dd0c429b9e6d24987e14cd5d66057adb498da.tar.gz
sandcrawler-a61dd0c429b9e6d24987e14cd5d66057adb498da.zip
ingest: handle partial citation_pdf_url tag
E.g., https://www.cureus.com/articles/29935-a-nomogram-for-the-rapid-prediction-of-hematocrit-following-blood-loss-and-fluid-shifts-in-neonates-infants-and-adults has: <meta name="citation_pdf_url"/> (the tag is present but has no "content" attribute).
-rw-r--r-- python/sandcrawler/html.py 3
1 files changed, 3 insertions, 0 deletions
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 8fbb0ba..6e346e7 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -54,6 +54,9 @@ def extract_fulltext_url(html_url, html_body):
if not meta:
# researchgate does this; maybe others also?
meta = soup.find('meta', attrs={"property":"citation_pdf_url"})
+ # if tag is only partially populated
+ if not meta.get('content'):
+ meta = None
# wiley has a weird almost-blank page we don't want to loop on
if meta and not "://onlinelibrary.wiley.com/doi/pdf/" in html_url:
url = meta['content'].strip()