diff options
-rw-r--r-- | python/sandcrawler/html.py | 2 |
1 file changed, 2 insertions, 0 deletions
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 1d24ca1..6236a3b 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -59,6 +59,8 @@ def extract_fulltext_url(html_url, html_body):
     if not meta:
         # researchgate does this; maybe others also?
         meta = soup.find('meta', attrs={"property":"citation_pdf_url"})
+    if not meta:
+        meta = soup.find('meta', attrs={"name":"eprints.document_url"})
     # if tag is only partially populated
     if meta and not meta.get('content'):
         meta = None