about | summary | refs | log | tree | commit | diff | stats
path: root/python/tests
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-11-08 21:54:24 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-11-08 21:54:24 -0800
commit: a68aadc4107fc68dc2748c52dab8a4bd92cca022 (patch)
tree: da3da0a847d5c10dee873e8bce8198a39c12ce1f /python/tests
parent: 6a701f966b8bc760bf904c0569562b0159e13559 (diff)
download: sandcrawler-a68aadc4107fc68dc2748c52dab8a4bd92cca022.tar.gz
download: sandcrawler-a68aadc4107fc68dc2748c52dab8a4bd92cca022.zip
move some PDF URL extraction into declarative format
Diffstat (limited to 'python/tests')
-rw-r--r-- python/tests/test_html.py 8
-rw-r--r-- python/tests/test_html_metadata.py 4
2 files changed, 3 insertions, 9 deletions
diff --git a/python/tests/test_html.py b/python/tests/test_html.py
index 7d58a39..9a81852 100644
--- a/python/tests/test_html.py
+++ b/python/tests/test_html.py
@@ -31,11 +31,3 @@ def test_extract_fulltext_url():
f.read(),
)
assert resp['pdf_url'] == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable"
-
- with open('tests/files/elife_article.html', 'rb') as f:
- resp = extract_fulltext_url(
- "https://elifesciences.org/articles/44753",
- f.read(),
- )
- assert resp['pdf_url'] == "https://elifesciences.org/download/aHR0cHM6Ly9jZG4uZWxpZmVzY2llbmNlcy5vcmcvYXJ0aWNsZXMvNDQ3NTMvZWxpZmUtNDQ3NTMtdjIucGRm/elife-44753-v2.pdf?_hash=CfyqOqVryCR4OjcMTfcdpeIWAGZznmh9jXksYKYChCw%3D"
-
diff --git a/python/tests/test_html_metadata.py b/python/tests/test_html_metadata.py
index b428b0d..bf26a98 100644
--- a/python/tests/test_html_metadata.py
+++ b/python/tests/test_html_metadata.py
@@ -39,6 +39,7 @@ def test_html_metadata_plos() -> None:
assert meta.publisher == "Public Library of Science"
assert meta.raw_references and "citation_title=Reticuloendotheliosis virus sequences within the genomes of field strains of fowlpox virus display variability;citation_author=P Singh;citation_author=W. M. Schnitzlein;citation_author=D. N. Tripathy;citation_journal_title=J. Virol;citation_volume=77;citation_number=77;citation_first_page=5855;citation_last_page=5862;citation_publication_date=2003;" in meta.raw_references
assert meta.release_type == "article-journal"
+ assert meta.pdf_fulltext_url == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable"
def test_html_metadata_elife() -> None:
@@ -46,7 +47,7 @@ def test_html_metadata_elife() -> None:
with open('tests/files/elife_article.html', 'r') as f:
elife_html = f.read()
- meta = html_extract_biblio("http://example.org", HTMLParser(elife_html))
+ meta = html_extract_biblio("https://elifesciences.org/articles/44753", HTMLParser(elife_html))
assert meta is not None
assert meta.title == "Parallel visual circuitry in a basal chordate"
assert meta.doi == "10.7554/eLife.44753"
@@ -63,6 +64,7 @@ def test_html_metadata_elife() -> None:
# 2019-04-18
assert meta.release_date == datetime.date(year=2019, month=4, day=18)
assert meta.publisher == "eLife Sciences Publications Limited"
+ assert meta.pdf_fulltext_url == "https://elifesciences.org/download/aHR0cHM6Ly9jZG4uZWxpZmVzY2llbmNlcy5vcmcvYXJ0aWNsZXMvNDQ3NTMvZWxpZmUtNDQ3NTMtdjIucGRm/elife-44753-v2.pdf?_hash=CfyqOqVryCR4OjcMTfcdpeIWAGZznmh9jXksYKYChCw%3D"
def test_html_metadata_peerj() -> None: