diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-08-11 00:28:06 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-08-11 00:28:06 -0700 |
commit | 33cc50939619d1c30bdfa800aba2137397a7ee0d (patch) | |
tree | cbd1bef13d061a97f6ce14138447dc2d11d396c1 | |
parent | 26d4766a2835aab00b0201198376d4ca42cc1d82 (diff) | |
download | sandcrawler-33cc50939619d1c30bdfa800aba2137397a7ee0d.tar.gz sandcrawler-33cc50939619d1c30bdfa800aba2137397a7ee0d.zip |
html: extract eprints PDF url (eg, ub.uni-heidelberg.de)
-rw-r--r-- | python/sandcrawler/html.py | 2 |
1 files changed, 2 insertions, 0 deletions
```diff
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 1d24ca1..6236a3b 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -59,6 +59,8 @@ def extract_fulltext_url(html_url, html_body):
     if not meta:
         # researchgate does this; maybe others also?
         meta = soup.find('meta', attrs={"property":"citation_pdf_url"})
+    if not meta:
+        meta = soup.find('meta', attrs={"name":"eprints.document_url"})
     # if tag is only partially populated
     if meta and not meta.get('content'):
         meta = None
```