# python/tests/test_html_metadata.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229


import datetime
import pytest

from sandcrawler.html_metadata import *


def test_html_metadata_plos() -> None:
    """
    Checks the bibliographic fields extracted from the PLOS ONE article
    fixture (tests/files/plos_one_article.html).
    """

    # explicit encoding so the fixture decodes the same way regardless of the
    # locale the tests run under
    with open('tests/files/plos_one_article.html', encoding='utf-8') as f:
        plos_html = f.read()

    meta = html_extract_biblio("http://example.org", HTMLParser(plos_html))
    assert meta is not None
    assert meta.title == "Assessment on reticuloendotheliosis virus infection in specific-pathogen-free chickens based on detection of yolk antibody"
    assert meta.doi == "10.1371/journal.pone.0213978"
    assert meta.pdf_fulltext_url == "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0213978&type=printable"
    assert meta.contrib_names == [
        "Yang Li",
        "Tuanjie Wang",
        "Lin Wang",
        "Mingjun Sun",
        "Zhizhong Cui",
        "Shuang Chang",
        "Yongping Wu",
        "Xiaodong Zhang",
        "Xiaohui Yu",
        "Tao Sun",
        "Peng Zhao",
    ]
    assert meta.container_name == "PLOS ONE"
    assert meta.container_abbrev == "PLOS ONE"
    # "Apr 22, 2019"
    assert meta.release_date == datetime.date(year=2019, month=4, day=22)
    assert meta.first_page == "e0213978"
    assert meta.issue == "4"
    assert meta.volume == "14"
    assert meta.container_issn == "1932-6203"
    assert meta.publisher == "Public Library of Science"
    # raw_references must be non-empty and contain this exact reference string
    assert meta.raw_references and "citation_title=Reticuloendotheliosis virus sequences within the genomes of field strains of fowlpox virus display variability;citation_author=P Singh;citation_author=W. M. Schnitzlein;citation_author=D. N. Tripathy;citation_journal_title=J. Virol;citation_volume=77;citation_number=77;citation_first_page=5855;citation_last_page=5862;citation_publication_date=2003;" in meta.raw_references
    assert meta.release_type == "article-journal"


def test_html_metadata_elife() -> None:
    """
    Checks the bibliographic fields extracted from the eLife article fixture
    (tests/files/elife_article.html).
    """

    # explicit encoding so the fixture decodes the same way regardless of the
    # locale the tests run under
    with open('tests/files/elife_article.html', encoding='utf-8') as f:
        elife_html = f.read()

    meta = html_extract_biblio("https://elifesciences.org/articles/44753", HTMLParser(elife_html))
    assert meta is not None
    assert meta.title == "Parallel visual circuitry in a basal chordate"
    assert meta.doi == "10.7554/eLife.44753"
    assert meta.contrib_names == [
        "Matthew J Kourakis",
        "Cezar Borba",
        "Angela Zhang",
        "Erin Newman-Smith",
        "Priscilla Salas",
        "B Manjunath",
        "William C Smith",
    ]
    assert meta.container_name == "eLife"
    # 2019-04-18
    assert meta.release_date == datetime.date(year=2019, month=4, day=18)
    assert meta.publisher == "eLife Sciences Publications Limited"
    assert meta.pdf_fulltext_url == "https://elifesciences.org/download/aHR0cHM6Ly9jZG4uZWxpZmVzY2llbmNlcy5vcmcvYXJ0aWNsZXMvNDQ3NTMvZWxpZmUtNDQ3NTMtdjIucGRm/elife-44753-v2.pdf?_hash=CfyqOqVryCR4OjcMTfcdpeIWAGZznmh9jXksYKYChCw%3D"


def test_html_metadata_peerj() -> None:
    """
    Checks the bibliographic fields extracted from the PeerJ open-access
    article fixture (tests/files/peerj_oa_article.html).
    """

    # explicit encoding: the expected contributor names include non-ASCII
    # characters, so decoding must not depend on the locale
    with open('tests/files/peerj_oa_article.html', encoding='utf-8') as f:
        peerj_html = f.read()

    meta = html_extract_biblio("http://example.org", HTMLParser(peerj_html))
    assert meta is not None
    assert meta.title == "The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles"
    assert meta.doi == "10.7717/peerj.4375"
    assert meta.contrib_names == [
        "Heather Piwowar",
        "Jason Priem",
        "Vincent Larivière",
        "Juan Pablo Alperin",
        "Lisa Matthias",
        "Bree Norlander",
        "Ashley Farley",
        "Jevin West",
        "Stefanie Haustein",
    ]
    assert meta.container_name == "PeerJ"
    # "2018-02-13"
    assert meta.release_date == datetime.date(year=2018, month=2, day=13)
    # only require that an XML fulltext URL was found and mentions ".xml"
    assert meta.xml_fulltext_url and ".xml" in meta.xml_fulltext_url


def test_html_metadata_nature() -> None:
    """
    Checks the bibliographic fields extracted from the Nature news article
    fixture (tests/files/nature_article.html).
    """

    # explicit encoding so the fixture decodes the same way regardless of the
    # locale the tests run under
    with open('tests/files/nature_article.html', encoding='utf-8') as f:
        nature_html = f.read()

    meta = html_extract_biblio("http://example.org", HTMLParser(nature_html))
    assert meta is not None
    assert meta.title == "More than 100 scientific journals have disappeared from the Internet"
    assert meta.doi == "10.1038/d41586-020-02610-z"
    assert meta.contrib_names == [
        "Diana Kwon",
    ]
    assert meta.container_name == "Nature"
    # "2020-09-10"
    assert meta.release_date == datetime.date(year=2020, month=9, day=10)
    assert meta.publisher == "Nature Publishing Group"
    # NOTE: the abstract text appears twice, apparently because of an error in
    # the Dublin Core metadata of the Nature HTML
    assert meta.abstract == "Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk.  Researchers have identified dozens of open-access journals that went offline between 2000 and 2019, and hundreds more that could be at risk."


def test_html_metadata_ojs3() -> None:
    """
    Checks the bibliographic fields extracted from the First Monday (OJS3)
    landing page fixture (tests/files/first_monday_ojs3_landingpage.html).
    """

    # explicit encoding: the expected abstract contains curly quotes and an
    # em-dash, so decoding must not depend on the locale
    with open('tests/files/first_monday_ojs3_landingpage.html', encoding='utf-8') as f:
        ojs3_html = f.read()

    meta = html_extract_biblio("http://example.org", HTMLParser(ojs3_html))
    assert meta is not None
    assert meta.title == "Surveillance, stigma & sociotechnical design for HIV"
    assert meta.doi == "10.5210/fm.v25i10.10274"
    assert meta.contrib_names == [
        "Calvin Liang",
        "Jevan Alexander Hutson",
        "Os Keyes",
    ]
    assert meta.container_name == "First Monday"
    assert meta.container_abbrev == "1"  # NOTE: bad source metadata
    assert meta.container_issn == "1396-0466"
    # "2020/09/10"
    assert meta.release_date == datetime.date(year=2020, month=9, day=10)
    assert meta.lang == "en"
    assert meta.abstract == "Online dating and hookup platforms have fundamentally changed people’s day-to-day practices of sex and love — but exist in tension with older social and medicolegal norms. This is particularly the case for people with HIV, who are frequently stigmatized, surveilled, ostracized, and incarcerated because of their status. Efforts to make intimate platforms “work” for HIV frequently focus on user-to-user interactions and disclosure of one’s HIV status but elide both the structural forces at work in regulating sex and the involvement of the state in queer lives. In an effort to foreground these forces and this involvement, we analyze the approaches that intimate platforms have taken in designing for HIV disclosure through a content analysis of 50 current platforms. We argue that the implicit reinforcement of stereotypes about who HIV is or is not a concern for, along with the failure to consider state practices when designing for data disclosure, opens up serious risks for HIV-positive and otherwise marginalized people. While we have no panacea for the tension between disclosure and risk, we point to bottom-up, communal, and queer approaches to design as a way of potentially making that tension easier to safely navigate."
    assert meta.html_fulltext_url == "https://firstmonday.org/ojs/index.php/fm/article/view/10274/9729"
    assert meta.release_type == "article-journal"


def test_html_metadata_dlib() -> None:
    """
    Checks the DOI and release date extracted from the D-Lib Magazine article
    fixture (tests/files/dlib_05vanhyning.html).
    """

    # explicit encoding so the fixture decodes the same way regardless of the
    # locale the tests run under
    with open('tests/files/dlib_05vanhyning.html', encoding='utf-8') as f:
        dlib_html = f.read()

    meta = html_extract_biblio("http://example.org", HTMLParser(dlib_html))
    assert meta is not None
    assert meta.doi == "10.1045/may2017-vanhyning"
    # "2017-05-15"
    assert meta.release_date == datetime.date(year=2017, month=5, day=15)

def test_html_metadata_dc_case() -> None:
    """
    Verifies that <meta name=""> lookups ignore letter case: the issue number
    must be found even though the tag name is written as "DC.Citation.Issue".
    """

    html_doc = """
    <html>
    <head>
      <meta name="DC.Citation.Issue" content="123"/>
    </head>
    <body>Hi.</body>
    </html>"""

    parsed = HTMLParser(html_doc)
    biblio = html_extract_biblio("http://example.org", parsed)

    assert biblio is not None
    assert biblio.issue == "123"

@pytest.fixture
def adblock() -> Any:
    """
    Pytest fixture that hands tests whatever load_adblock_rules() returns.
    """
    rules = load_adblock_rules()
    return rules

def test_html_resources(adblock: Any) -> None:
    """
    Runs sub-resource extraction over several fixture pages.

    The D-Lib and PLOS pages are checked for specific resources being present
    or filtered out; the remaining pages are only run to confirm extraction
    completes without raising.
    """

    # explicit encoding throughout so fixtures decode the same way regardless
    # of the locale the tests run under
    with open('tests/files/dlib_05vanhyning.html', encoding='utf-8') as f:
        dlib_html = f.read()

    resources = html_extract_resources(
        "http://www.dlib.org/dlib/may17/vanhyning/05vanhyning.html",
        HTMLParser(dlib_html),
        adblock,
    )

    assert dict(url="http://www.dlib.org/style/style1.css", type="stylesheet") in resources

    # check that adblock working
    for r in resources:
        assert '/ga.js' not in r['url']

    with open('tests/files/plos_one_article.html', encoding='utf-8') as f:
        plos_html = f.read()

    resources = html_extract_resources(
        "https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0213978",
        HTMLParser(plos_html),
        adblock,
    )

    # check that custom adblock working
    for r in resources:
        assert 'crossmark-cdn.crossref.org' not in r['url']

    # smoke tests: no expectations on the result, extraction just must not raise
    smoke_cases = [
        ("https://firstmonday.org/blah/", 'tests/files/first_monday_ojs3_landingpage.html'),
        ("https://elife.org/blah/", 'tests/files/elife_article.html'),
        ("https://nature.com/blah/", 'tests/files/nature_article.html'),
    ]
    for base_url, fixture_path in smoke_cases:
        with open(fixture_path, encoding='utf-8') as f:
            page_html = f.read()
        html_extract_resources(base_url, HTMLParser(page_html), adblock)