From cfe04b60a28e90c694d2558eadb71c0b280c40e8 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 24 Feb 2020 12:10:28 -0800 Subject: ingest: make ehp.niehs.nih.gov rule more robust --- python/sandcrawler/html.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'python') diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py index 34da876..7189055 100644 --- a/python/sandcrawler/html.py +++ b/python/sandcrawler/html.py @@ -270,10 +270,11 @@ def extract_fulltext_url(html_url, html_body): # ehp.niehs.nih.gov # https://ehp.niehs.nih.gov/doi/full/10.1289/EHP4709 - if "://ehp.niehs.nih.gov/doi/full/" in html_url: + # https://ehp.niehs.nih.gov/doi/10.1289/ehp.113-a51 + if "://ehp.niehs.nih.gov/doi/" in html_url: # if b'/doi/pdf/10.' in html_body: - url = html_url.replace('/doi/full/10.', '/doi/pdf/10.') + url = html_url.replace('/doi/full/10.', '/doi/pdf/10.').replace('/doi/10.', '/doi/pdf/10.') return dict(pdf_url=url, technique='ehp.niehs.nigh.gov-url') # journals.tsu.ru (and maybe others) -- cgit v1.2.3