html: extract eprints PDF URL (e.g., ub.uni-heidelberg.de)

author: Bryan Newbold <bnewbold@archive.org> 2020-08-11 00:28:06 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-08-11 00:28:06 -0700
commit: 33cc50939619d1c30bdfa800aba2137397a7ee0d (patch)
tree: cbd1bef13d061a97f6ce14138447dc2d11d396c1
parent: 26d4766a2835aab00b0201198376d4ca42cc1d82 (diff)
download: sandcrawler-33cc50939619d1c30bdfa800aba2137397a7ee0d.tar.gz
sandcrawler-33cc50939619d1c30bdfa800aba2137397a7ee0d.zip
1 file changed, 2 insertions, 0 deletions
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 1d24ca1..6236a3b 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -59,6 +59,8 @@ def extract_fulltext_url(html_url, html_body):
     if not meta:
         # researchgate does this; maybe others also?
         meta = soup.find('meta', attrs={"property":"citation_pdf_url"})
+    if not meta:
+        meta = soup.find('meta', attrs={"name":"eprints.document_url"})
     # if tag is only partially populated
     if meta and not meta.get('content'):
         meta = None
author	Bryan Newbold <bnewbold@archive.org>	2020-08-11 00:28:06 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2020-08-11 00:28:06 -0700
commit	33cc50939619d1c30bdfa800aba2137397a7ee0d (patch)
tree	cbd1bef13d061a97f6ce14138447dc2d11d396c1
parent	26d4766a2835aab00b0201198376d4ca42cc1d82 (diff)
download	sandcrawler-33cc50939619d1c30bdfa800aba2137397a7ee0d.tar.gz sandcrawler-33cc50939619d1c30bdfa800aba2137397a7ee0d.zip