diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-02-24 12:10:28 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-02-24 12:10:28 -0800 |
commit | cfe04b60a28e90c694d2558eadb71c0b280c40e8 (patch) | |
tree | 7a7f053f11b98997e757d3c6025bbc517cc9feb4 | |
parent | c5b39c4323387e59fb53184711dd113f0483b42a (diff) | |
download | sandcrawler-cfe04b60a28e90c694d2558eadb71c0b280c40e8.tar.gz sandcrawler-cfe04b60a28e90c694d2558eadb71c0b280c40e8.zip |
ingest: make ehp.niehs.nih.gov rule more robust
-rw-r--r-- | python/sandcrawler/html.py | 5 |
1 file changed, 3 insertions, 2 deletions
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py index 34da876..7189055 100644 --- a/python/sandcrawler/html.py +++ b/python/sandcrawler/html.py @@ -270,10 +270,11 @@ def extract_fulltext_url(html_url, html_body): # ehp.niehs.nih.gov # https://ehp.niehs.nih.gov/doi/full/10.1289/EHP4709 - if "://ehp.niehs.nih.gov/doi/full/" in html_url: + # https://ehp.niehs.nih.gov/doi/10.1289/ehp.113-a51 + if "://ehp.niehs.nih.gov/doi/" in html_url: # <a href="/doi/pdf/10.1289/EHP4709" target="_blank"> if b'/doi/pdf/10.' in html_body: - url = html_url.replace('/doi/full/10.', '/doi/pdf/10.') + url = html_url.replace('/doi/full/10.', '/doi/pdf/10.').replace('/doi/10.', '/doi/pdf/10.') return dict(pdf_url=url, technique='ehp.niehs.nigh.gov-url') # journals.tsu.ru (and maybe others) |