From 266b0a2d5928921d3b3f992fa249b22f7a5edb16 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Mon, 24 Feb 2020 13:21:57 -0800
Subject: ingest: more direct americanarchivist PDF url guess

---
 python/sandcrawler/html.py | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'python')

diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 7189055..e993e74 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -224,6 +224,10 @@ def extract_fulltext_url(html_url, html_body):
     # american archivist (OA)
     # https://americanarchivist.org/doi/abs/10.17723/aarc.62.2.j475270470145630
     if "://americanarchivist.org/doi/" in html_url and not "/doi/pdf" in html_url:
+        # use a more aggressive direct guess to avoid rate-limiting...
+        if "/doi/10." in html_url:
+            url = html_url.replace("/doi/10.", "/doi/pdf/10.")
+            return dict(pdf_url=url, technique='archivist-url')
         #
         hrefs = soup.find_all('a', attrs={"target":"_blank"})
         for href in hrefs:
-- 
cgit v1.2.3