html: more metadata tests

author: Bryan Newbold <bnewbold@archive.org> 2020-10-29 14:31:21 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-10-29 14:31:21 -0700
commit: 3d56509ef83226a808ebb078f5cac9815afb5d9d (patch)
tree: 2be006c040b3e7ba76c00702fbe6cb513da3451f /python/tests/test_html_metadata.py
parent: fb98ca7bdeca9bda84d86ac4a3c65661b9542264 (diff)
download: sandcrawler-3d56509ef83226a808ebb078f5cac9815afb5d9d.tar.gz sandcrawler-3d56509ef83226a808ebb078f5cac9815afb5d9d.zip
1 file changed, 88 insertions, 0 deletions
diff --git a/python/tests/test_html_metadata.py b/python/tests/test_html_metadata.py
index 4d670e5..597520c 100644
--- a/python/tests/test_html_metadata.py
+++ b/python/tests/test_html_metadata.py
@@ -1,5 +1,6 @@
 
 import datetime
+import pytest
 
 from sandcrawler.html_metadata import *
 
@@ -64,6 +65,31 @@ def test_html_metadata_elife() -> None:
     assert meta.publisher == "eLife Sciences Publications Limited"
 
 
+def test_html_metadata_peerj() -> None:
+    """Extract bibliographic metadata from a saved PeerJ article page."""
+    with open('tests/files/peerj_oa_article.html', 'r', encoding='utf-8') as f:
+        peerj_html = f.read()
+
+    meta = html_extract_biblio(HTMLParser(peerj_html))
+    assert meta is not None
+    assert meta.title == "The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles"
+    assert meta.doi == "10.7717/peerj.4375"
+    assert meta.contrib_names == [
+        "Heather Piwowar",
+        "Jason Priem",
+        "Vincent Larivière",  # non-ASCII: the reason the fixture encoding is pinned
+        "Juan Pablo Alperin",
+        "Lisa Matthias",
+        "Bree Norlander",
+        "Ashley Farley",
+        "Jevin West",
+        "Stefanie Haustein",
+    ]
+    assert meta.container_name == "PeerJ"
+    # i.e. "2018-02-13"
+    assert meta.release_date == datetime.date(year=2018, month=2, day=13)
+
+
 def test_html_metadata_nature() -> None:
 
     with open('tests/files/nature_article.html', 'r') as f:
@@ -136,3 +162,65 @@ def test_html_metadata_dc_case() -> None:
     meta = html_extract_biblio(HTMLParser(snippet))
     assert meta is not None
     assert meta.issue == "123"
+
+@pytest.fixture  # adblock rules for the resource tests; default (function) scope, so built per requesting test
+def adblock() -> Any:
+    return load_adblock_rules()  # presumably exported by the sandcrawler.html_metadata star import -- confirm
+
+def test_html_resources(adblock: Any) -> None:
+    """Check sub-resource extraction and adblock filtering; the last three pages are smoke tests."""
+    with open('tests/files/dlib_05vanhyning.html', 'r', encoding='utf-8') as f:
+        dlib_html = f.read()
+
+    resources = html_extract_resources(
+        "http://www.dlib.org/dlib/may17/vanhyning/05vanhyning.html",
+        HTMLParser(dlib_html),
+        adblock,
+    )
+
+    assert dict(url="http://www.dlib.org/style/style1.css", type="stylesheet") in resources
+
+    # check that the adblock rules are working
+    for r in resources:
+        assert '/ga.js' not in r['url']
+
+    with open('tests/files/plos_one_article.html', 'r', encoding='utf-8') as f:
+        plos_html = f.read()
+
+    resources = html_extract_resources(
+        "https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0213978",
+        HTMLParser(plos_html),
+        adblock,
+    )
+
+    # check that the custom adblock rules are working
+    for r in resources:
+        assert 'crossmark-cdn.crossref.org' not in r['url']
+
+    with open('tests/files/first_monday_ojs3_landingpage.html', 'r', encoding='utf-8') as f:
+        monday_html = f.read()
+
+    resources = html_extract_resources(
+        "https://firstmonday.org/blah/",
+        HTMLParser(monday_html),
+        adblock,
+    )  # smoke test: only checks that extraction does not raise
+
+    with open('tests/files/elife_article.html', 'r', encoding='utf-8') as f:
+        elife_html = f.read()
+
+    resources = html_extract_resources(
+        "https://elife.org/blah/",
+        HTMLParser(elife_html),
+        adblock,
+    )  # smoke test: only checks that extraction does not raise
+
+    with open('tests/files/nature_article.html', 'r', encoding='utf-8') as f:
+        nature_html = f.read()
+
+    resources = html_extract_resources(
+        "https://nature.com/blah/",
+        HTMLParser(nature_html),
+        adblock,
+    )  # smoke test: only checks that extraction does not raise
+
author	Bryan Newbold <bnewbold@archive.org>	2020-10-29 14:31:21 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2020-10-29 14:31:21 -0700
commit	3d56509ef83226a808ebb078f5cac9815afb5d9d (patch)
tree	2be006c040b3e7ba76c00702fbe6cb513da3451f /python/tests/test_html_metadata.py
parent	fb98ca7bdeca9bda84d86ac4a3c65661b9542264 (diff)
download	sandcrawler-3d56509ef83226a808ebb078f5cac9815afb5d9d.tar.gz sandcrawler-3d56509ef83226a808ebb078f5cac9815afb5d9d.zip