html extract: protocols.io, fix americanarchivist

author: Bryan Newbold <bnewbold@archive.org> 2020-01-10 17:07:18 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-01-10 17:07:18 -0800
commit: f916655ab949ee11b3aa6bc84bb3b2118b0748d0 (patch)
tree: 2ca442217f4638bf59337f594b4565f97d369f8a /python
parent: bdb4a63ae43c9c292611816a8f74fe7bd00e8a9c (diff)
download: sandcrawler-f916655ab949ee11b3aa6bc84bb3b2118b0748d0.tar.gz sandcrawler-f916655ab949ee11b3aa6bc84bb3b2118b0748d0.zip
1 file changed, 7 insertions, 1 deletion
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 93eb28e..e6f0f69 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -215,7 +215,7 @@ def extract_fulltext_url(html_url, html_body):
     # https://americanarchivist.org/doi/abs/10.17723/aarc.62.2.j475270470145630
     if "://americanarchivist.org/doi/abs/" in html_url:
         # <a href="/doi/pdf/10.17723/aarc.62.2.j475270470145630" target="_blank">
-        hrefs = soup.find_all('a', attrs={"_target":"blank"})
+        hrefs = soup.find_all('a', attrs={"target":"_blank"})
         for href in hrefs:
             url = href['href'].strip()
             if "/doi/pdf/" in url:
@@ -224,4 +224,10 @@ def extract_fulltext_url(html_url, html_body):
                 elif url.startswith('/'):
                     return dict(pdf_url=host_prefix+url, technique='publisher-href')
 
+    # protocols.io
+    # https://www.protocols.io/view/flow-cytometry-protocol-mgdc3s6
+    if "://www.protocols.io/view/" in html_url and not html_url.endswith(".pdf"):
+        url = html_url + ".pdf"
+        return dict(pdf_url=url, technique='protocolsio-url')
+
     return dict()
author	Bryan Newbold <bnewbold@archive.org>	2020-01-10 17:07:18 -0800
committer	Bryan Newbold <bnewbold@archive.org>	2020-01-10 17:07:18 -0800
commit	f916655ab949ee11b3aa6bc84bb3b2118b0748d0 (patch)
tree	2ca442217f4638bf59337f594b4565f97d369f8a /python
parent	bdb4a63ae43c9c292611816a8f74fe7bd00e8a9c (diff)
download	sandcrawler-f916655ab949ee11b3aa6bc84bb3b2118b0748d0.tar.gz sandcrawler-f916655ab949ee11b3aa6bc84bb3b2118b0748d0.zip