about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-02-24 12:10:28 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-02-24 12:10:28 -0800
commit: cfe04b60a28e90c694d2558eadb71c0b280c40e8 (patch)
tree: 7a7f053f11b98997e757d3c6025bbc517cc9feb4
parent: c5b39c4323387e59fb53184711dd113f0483b42a (diff)
download: sandcrawler-cfe04b60a28e90c694d2558eadb71c0b280c40e8.tar.gz
sandcrawler-cfe04b60a28e90c694d2558eadb71c0b280c40e8.zip
ingest: make ehp.niehs.nih.gov rule more robust
-rw-r--r-- python/sandcrawler/html.py 5
1 files changed, 3 insertions, 2 deletions
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 34da876..7189055 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -270,10 +270,11 @@ def extract_fulltext_url(html_url, html_body):
# ehp.niehs.nih.gov
# https://ehp.niehs.nih.gov/doi/full/10.1289/EHP4709
- if "://ehp.niehs.nih.gov/doi/full/" in html_url:
+ # https://ehp.niehs.nih.gov/doi/10.1289/ehp.113-a51
+ if "://ehp.niehs.nih.gov/doi/" in html_url:
# <a href="/doi/pdf/10.1289/EHP4709" target="_blank">
if b'/doi/pdf/10.' in html_body:
- url = html_url.replace('/doi/full/10.', '/doi/pdf/10.')
+ url = html_url.replace('/doi/full/10.', '/doi/pdf/10.').replace('/doi/10.', '/doi/pdf/10.')
return dict(pdf_url=url, technique='ehp.niehs.nigh.gov-url')
# journals.tsu.ru (and maybe others)