From c5b39c4323387e59fb53184711dd113f0483b42a Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Mon, 24 Feb 2020 11:50:49 -0800
Subject: small tweak to americanarchivist.org URL extraction

---
 python/sandcrawler/html.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'python')
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 091162d..34da876 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -223,7 +223,7 @@ def extract_fulltext_url(html_url, html_body):
 
     # american archivist (OA)
     # https://americanarchivist.org/doi/abs/10.17723/aarc.62.2.j475270470145630
-    if "://americanarchivist.org/doi/abs/" in html_url:
+    if "://americanarchivist.org/doi/" in html_url and not "/doi/pdf" in html_url:
         # <a href="/doi/pdf/10.17723/aarc.62.2.j475270470145630" target="_blank">
         hrefs = soup.find_all('a', attrs={"target":"_blank"})
         for href in hrefs:
-- 
cgit v1.2.3