diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2019-11-13 21:11:49 -0800 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2019-11-13 21:11:49 -0800 | 
| commit | 8a58ccc381534db6bbcc1275cd561ccf3a2af23a (patch) | |
| tree | a7786ce89106180d4d71cdcf5d174aa496fa5994 /python | |
| parent | 9e97a11c1ef26ee54060b6ef7e23052d26b54b4a (diff) | |
| download | sandcrawler-8a58ccc381534db6bbcc1275cd561ccf3a2af23a.tar.gz sandcrawler-8a58ccc381534db6bbcc1275cd561ccf3a2af23a.zip | |
citation_pdf_url with host-relative URLs
Diffstat (limited to 'python')
| -rw-r--r-- | python/sandcrawler/html.py | 4 | 
1 file changed, 3 insertions, 1 deletion
```diff
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 858e02a..7e1e10d 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -25,7 +25,9 @@ def extract_fulltext_url(html_url, html_body):
         meta = soup.find('meta', attrs={"name":"bepress_citation_pdf_url"})
     if meta:
         url = meta['content'].strip()
-        if url.startswith('http'):
+        if url.startswith('/'):
+            return dict(pdf_url=host_prefix+url, technique='citation_pdf_url')
+        elif url.startswith('http'):
             return dict(pdf_url=url, technique='citation_pdf_url')
         else:
             sys.stderr.write("malformed citation_pdf_url? {}\n".format(url))
```
