diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-01-08 18:24:48 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-01-09 16:30:02 -0800 |
commit | 7eb8f74bc15d1acb5771320ec4e2342d85077555 (patch) | |
tree | 88f21d3c3bc881761e7961d36e3ae3b896a1a982 /python/tests/test_html.py | |
parent | a6f2067e288fc235375af4fed12c5782a82856dc (diff) | |
download | sandcrawler-7eb8f74bc15d1acb5771320ec4e2342d85077555.tar.gz sandcrawler-7eb8f74bc15d1acb5771320ec4e2342d85077555.zip |
basic elife+plos extraction tests
Ripped out some HTML, but these could have been minimized even further
to keep repository from growing large.
Diffstat (limited to 'python/tests/test_html.py')
-rw-r--r-- | python/tests/test_html.py | 41 |
1 files changed, 41 insertions, 0 deletions
diff --git a/python/tests/test_html.py b/python/tests/test_html.py
new file mode 100644
index 0000000..3b59883
--- /dev/null
+++ b/python/tests/test_html.py
@@ -0,0 +1,41 @@
+
+import json
+import pytest
+import responses
+
+from sandcrawler.html import extract_fulltext_url
+
+def test_extract_fulltext_url():
+
+    resp = extract_fulltext_url("asdf", "asdf")
+    assert resp == {}
+
+    resp = extract_fulltext_url(
+        "http://dummy-site/",
+        b"""<html>
+        <head>
+          <meta name="citation_pdf_url" content="http://www.example.com/content/271/20/11761.full.pdf">
+        </head>
+        <body>
+        <h1>my big article here</h1>
+        blah
+        </body>
+        </html>"""
+    )
+    assert resp['pdf_url'] == "http://www.example.com/content/271/20/11761.full.pdf"
+    assert resp['technique'] == "citation_pdf_url"
+
+    with open('tests/files/plos_one_article.html', 'r') as f:
+        resp = extract_fulltext_url(
+            "https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0213978",
+            f.read(),
+        )
+    assert resp['pdf_url'] == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable"
+
+    with open('tests/files/elife_article.html', 'r') as f:
+        resp = extract_fulltext_url(
+            "https://elifesciences.org/articles/44753",
+            f.read(),
+        )
+    assert resp['pdf_url'] == "https://elifesciences.org/download/aHR0cHM6Ly9jZG4uZWxpZmVzY2llbmNlcy5vcmcvYXJ0aWNsZXMvNDQ3NTMvZWxpZmUtNDQ3NTMtdjIucGRm/elife-44753-v2.pdf?_hash=CfyqOqVryCR4OjcMTfcdpeIWAGZznmh9jXksYKYChCw%3D"
+