about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2022-07-16 13:07:38 -0700
committer Bryan Newbold <bnewbold@archive.org> 2022-07-16 13:07:38 -0700
commit ab8f4b0f957fa020f94fbb373e4d41f3cbb94293 (patch)
tree ef8f15c58ebecad53494447528d2115d20ffe75f
parent 468a7b6f91c5b3b0a80e3c82c408b8e02ce71e13 (diff)
download sandcrawler-ab8f4b0f957fa020f94fbb373e4d41f3cbb94293.tar.gz
sandcrawler-ab8f4b0f957fa020f94fbb373e4d41f3cbb94293.zip
HTML: no longer extracting citation_pdf_url in main extract function
-rw-r--r-- python/tests/test_html.py 24
1 files changed, 0 insertions, 24 deletions
diff --git a/python/tests/test_html.py b/python/tests/test_html.py
index 614b802..0f951eb 100644
--- a/python/tests/test_html.py
+++ b/python/tests/test_html.py
@@ -6,27 +6,3 @@ def test_extract_fulltext_url():
resp = extract_fulltext_url("asdf", b"asdf")
assert resp == {}
- resp = extract_fulltext_url(
- "http://dummy-site/",
- b"""<html>
- <head>
- <meta name="citation_pdf_url" content="http://www.example.com/content/271/20/11761.full.pdf">
- </head>
- <body>
- <h1>my big article here</h1>
- blah
- </body>
- </html>""",
- )
- assert resp["pdf_url"] == "http://www.example.com/content/271/20/11761.full.pdf"
- assert resp["technique"] == "citation_pdf_url"
-
- with open("tests/files/plos_one_article.html", "rb") as f:
- resp = extract_fulltext_url(
- "https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0213978",
- f.read(),
- )
- assert (
- resp["pdf_url"]
- == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable"
- )