aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/html.py 4
1 file changed, 4 insertions, 0 deletions
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 7189055..e993e74 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -224,6 +224,10 @@ def extract_fulltext_url(html_url, html_body):
# american archivist (OA)
# https://americanarchivist.org/doi/abs/10.17723/aarc.62.2.j475270470145630
if "://americanarchivist.org/doi/" in html_url and not "/doi/pdf" in html_url:
+ # use a more aggressive direct guess to avoid rate-limiting...
+ if "/doi/10." in html_url:
+ url = html_url.replace("/doi/10.", "/doi/pdf/10.")
+ return dict(pdf_url=url, technique='archivist-url')
# <a href="/doi/pdf/10.17723/aarc.62.2.j475270470145630" target="_blank">
hrefs = soup.find_all('a', attrs={"target":"_blank"})
for href in hrefs: