diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-01-10 17:07:18 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-01-10 17:07:18 -0800 |
commit | f916655ab949ee11b3aa6bc84bb3b2118b0748d0 (patch) | |
tree | 2ca442217f4638bf59337f594b4565f97d369f8a /python | |
parent | bdb4a63ae43c9c292611816a8f74fe7bd00e8a9c (diff) | |
download | sandcrawler-f916655ab949ee11b3aa6bc84bb3b2118b0748d0.tar.gz sandcrawler-f916655ab949ee11b3aa6bc84bb3b2118b0748d0.zip |
html extract: protocols.io, fix americanarchivist
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/html.py | 8 |
1 files changed, 7 insertions, 1 deletions
```diff
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 93eb28e..e6f0f69 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -215,7 +215,7 @@ def extract_fulltext_url(html_url, html_body):
     # https://americanarchivist.org/doi/abs/10.17723/aarc.62.2.j475270470145630
     if "://americanarchivist.org/doi/abs/" in html_url:
         # <a href="/doi/pdf/10.17723/aarc.62.2.j475270470145630" target="_blank">
-        hrefs = soup.find_all('a', attrs={"_target":"blank"})
+        hrefs = soup.find_all('a', attrs={"target":"_blank"})
         for href in hrefs:
             url = href['href'].strip()
             if "/doi/pdf/" in url:
@@ -224,4 +224,10 @@ def extract_fulltext_url(html_url, html_body):
                 elif url.startswith('/'):
                     return dict(pdf_url=host_prefix+url, technique='publisher-href')
 
+    # protocols.io
+    # https://www.protocols.io/view/flow-cytometry-protocol-mgdc3s6
+    if "://www.protocols.io/view/" in html_url and not html_url.endswith(".pdf"):
+        url = html_url + ".pdf"
+        return dict(pdf_url=url, technique='protocolsio-url')
+
     return dict()
```