about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- python/sandcrawler/html.py 5
1 files changed, 3 insertions, 2 deletions
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 34da876..7189055 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -270,10 +270,11 @@ def extract_fulltext_url(html_url, html_body):
# ehp.niehs.nih.gov
# https://ehp.niehs.nih.gov/doi/full/10.1289/EHP4709
- if "://ehp.niehs.nih.gov/doi/full/" in html_url:
+ # https://ehp.niehs.nih.gov/doi/10.1289/ehp.113-a51
+ if "://ehp.niehs.nih.gov/doi/" in html_url:
# <a href="/doi/pdf/10.1289/EHP4709" target="_blank">
if b'/doi/pdf/10.' in html_body:
- url = html_url.replace('/doi/full/10.', '/doi/pdf/10.')
+ url = html_url.replace('/doi/full/10.', '/doi/pdf/10.').replace('/doi/10.', '/doi/pdf/10.')
return dict(pdf_url=url, technique='ehp.niehs.nigh.gov-url')
# journals.tsu.ru (and maybe others)