Handle citation_pdf_url values that are host-relative URLs

author: Bryan Newbold <bnewbold@archive.org> 2019-11-13 21:11:49 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2019-11-13 21:11:49 -0800
commit: 8a58ccc381534db6bbcc1275cd561ccf3a2af23a (patch)
tree: a7786ce89106180d4d71cdcf5d174aa496fa5994 /python
parent: 9e97a11c1ef26ee54060b6ef7e23052d26b54b4a (diff)
download: sandcrawler-8a58ccc381534db6bbcc1275cd561ccf3a2af23a.tar.gz
sandcrawler-8a58ccc381534db6bbcc1275cd561ccf3a2af23a.zip
1 file changed, 3 insertions, 1 deletion
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 858e02a..7e1e10d 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -25,7 +25,9 @@ def extract_fulltext_url(html_url, html_body):
         meta = soup.find('meta', attrs={"name":"bepress_citation_pdf_url"})
     if meta:
         url = meta['content'].strip()
-        if url.startswith('http'):
+        if url.startswith('/'):
+            return dict(pdf_url=host_prefix+url, technique='citation_pdf_url')
+        elif url.startswith('http'):
             return dict(pdf_url=url, technique='citation_pdf_url')
         else:
             sys.stderr.write("malformed citation_pdf_url? {}\n".format(url))
author	Bryan Newbold <bnewbold@archive.org>	2019-11-13 21:11:49 -0800
committer	Bryan Newbold <bnewbold@archive.org>	2019-11-13 21:11:49 -0800
commit	8a58ccc381534db6bbcc1275cd561ccf3a2af23a (patch)
tree	a7786ce89106180d4d71cdcf5d174aa496fa5994 /python
parent	9e97a11c1ef26ee54060b6ef7e23052d26b54b4a (diff)
download	sandcrawler-8a58ccc381534db6bbcc1275cd561ccf3a2af23a.tar.gz sandcrawler-8a58ccc381534db6bbcc1275cd561ccf3a2af23a.zip