aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-02-24 11:50:49 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-02-24 11:50:49 -0800
commit c5b39c4323387e59fb53184711dd113f0483b42a (patch)
tree baa80423f51f34bb7c915a1199cff255437afd25
parent 9ca0c0bc13082aa061eee3da6b057fbacb4052e9 (diff)
download sandcrawler-c5b39c4323387e59fb53184711dd113f0483b42a.tar.gz
sandcrawler-c5b39c4323387e59fb53184711dd113f0483b42a.zip
small tweak to americanarchivist.org URL extraction
-rw-r--r-- python/sandcrawler/html.py 2
1 files changed, 1 insertions, 1 deletions
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 091162d..34da876 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -223,7 +223,7 @@ def extract_fulltext_url(html_url, html_body):
# american archivist (OA)
# https://americanarchivist.org/doi/abs/10.17723/aarc.62.2.j475270470145630
- if "://americanarchivist.org/doi/abs/" in html_url:
+ if "://americanarchivist.org/doi/" in html_url and not "/doi/pdf" in html_url:
# <a href="/doi/pdf/10.17723/aarc.62.2.j475270470145630" target="_blank">
hrefs = soup.find_all('a', attrs={"target":"_blank"})
for href in hrefs: