From 8a58ccc381534db6bbcc1275cd561ccf3a2af23a Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 13 Nov 2019 21:11:49 -0800 Subject: citation_pdf_url with host-relative URLs --- python/sandcrawler/html.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py index 858e02a..7e1e10d 100644 --- a/python/sandcrawler/html.py +++ b/python/sandcrawler/html.py @@ -25,7 +25,9 @@ def extract_fulltext_url(html_url, html_body): meta = soup.find('meta', attrs={"name":"bepress_citation_pdf_url"}) if meta: url = meta['content'].strip() - if url.startswith('http'): + if url.startswith('/'): + return dict(pdf_url=host_prefix+url, technique='citation_pdf_url') + elif url.startswith('http'): return dict(pdf_url=url, technique='citation_pdf_url') else: sys.stderr.write("malformed citation_pdf_url? {}\n".format(url)) -- cgit v1.2.3