diff options
author | Bryan Newbold <bnewbold@archive.org> | 2022-07-16 13:07:38 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2022-07-16 13:07:38 -0700 |
commit | ab8f4b0f957fa020f94fbb373e4d41f3cbb94293 (patch) | |
tree | ef8f15c58ebecad53494447528d2115d20ffe75f /python | |
parent | 468a7b6f91c5b3b0a80e3c82c408b8e02ce71e13 (diff) | |
download | sandcrawler-ab8f4b0f957fa020f94fbb373e4d41f3cbb94293.tar.gz sandcrawler-ab8f4b0f957fa020f94fbb373e4d41f3cbb94293.zip |
HTML: no longer extracting citation_pdf_url in main extract function
Diffstat (limited to 'python')
-rw-r--r-- | python/tests/test_html.py | 24 |
1 file changed, 0 insertions, 24 deletions
```diff
diff --git a/python/tests/test_html.py b/python/tests/test_html.py
index 614b802..0f951eb 100644
--- a/python/tests/test_html.py
+++ b/python/tests/test_html.py
@@ -6,27 +6,3 @@ def test_extract_fulltext_url():
 
     resp = extract_fulltext_url("asdf", b"asdf")
     assert resp == {}
-
-    resp = extract_fulltext_url(
-        "http://dummy-site/",
-        b"""<html>
-        <head>
-          <meta name="citation_pdf_url" content="http://www.example.com/content/271/20/11761.full.pdf">
-        </head>
-        <body>
-        <h1>my big article here</h1>
-        blah
-        </body>
-        </html>""",
-    )
-    assert resp["pdf_url"] == "http://www.example.com/content/271/20/11761.full.pdf"
-    assert resp["technique"] == "citation_pdf_url"
-
-    with open("tests/files/plos_one_article.html", "rb") as f:
-        resp = extract_fulltext_url(
-            "https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0213978",
-            f.read(),
-        )
-    assert (
-        resp["pdf_url"]
-        == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable"
-    )
```