diff options
Diffstat (limited to 'tests/test_parse.py')
-rw-r--r-- | tests/test_parse.py | 48 |
1 files changed, 48 insertions, 0 deletions
def test_citation_emdash() -> None:
    """Check that a page range written with U+2013 survives citation parsing.

    The same TEI-XML fixture is handed to ``parse_citation_xml`` once as raw
    bytes and once as decoded text; both results must carry the unstructured
    citation unchanged and expose the page range with an ASCII hyphen.

    NOTE: the test and fixture names say "emdash", but the character that is
    actually asserted on is U+2013 (EN DASH).
    """
    with open("tests/files/citation_emdash.tei.xml", "rb") as f:
        tei_xml_bytes = f.read()
    # Decode explicitly as UTF-8: the fixture contains a non-ASCII character,
    # so relying on the platform default encoding would make the test
    # locale-dependent.
    with open("tests/files/citation_emdash.tei.xml", "r", encoding="utf-8") as f2:
        tei_xml_str = f2.read()

    # The dash in the page range is U+2013 (EN DASH); it is written as an
    # escape so it cannot be silently replaced by a look-alike character.
    unstructured = "Goodman, N. (1972). A world of individuals. In Problems and projects (pp. 155\u2013172). Bobs-Merrill company."
    assert unstructured[70:81] == "pp. 155\u2013172"
    # Sanity-check the fixture itself before exercising the parser.
    assert "pp. 155\u2013172".encode("utf-8") in tei_xml_bytes
    assert "pp. 155\u2013172" in tei_xml_str

    ref_bytes = parse_citation_xml(tei_xml_bytes)
    assert ref_bytes
    assert ref_bytes.unstructured == unstructured
    assert ref_bytes.first_page == "155"
    assert ref_bytes.pages == "155-172"

    ref_str = parse_citation_xml(tei_xml_str)
    assert ref_str
    assert ref_str.unstructured == unstructured
    assert ref_str.first_page == "155"
    assert ref_str.pages == "155-172"


def test_citation_list_utf8() -> None:
    """Check U+2013 handling when parsing a citation *list* document.

    Mirrors ``test_citation_emdash`` but goes through
    ``parse_citation_list_xml`` and inspects the first returned reference,
    again for both bytes and text input.
    """
    with open("tests/files/citation_list_emdash.tei.xml", "rb") as f:
        tei_xml_bytes = f.read()
    # Explicit UTF-8 so the text-mode read does not depend on the locale.
    with open("tests/files/citation_list_emdash.tei.xml", "r", encoding="utf-8") as f2:
        tei_xml_str = f2.read()

    # The dash in the page range is U+2013 (EN DASH).
    unstructured = "Goodman, N. (1972). A world of individuals. In Problems and projects (pp. 155\u2013172). Bobs-Merrill company."
    assert unstructured[70:81] == "pp. 155\u2013172"

    ref_bytes = parse_citation_list_xml(tei_xml_bytes)[0]
    assert ref_bytes
    assert ref_bytes.unstructured == unstructured
    assert ref_bytes.first_page == "155"
    assert ref_bytes.pages == "155-172"

    ref_str = parse_citation_list_xml(tei_xml_str)[0]
    assert ref_str
    assert ref_str.unstructured == unstructured
    assert ref_str.first_page == "155"
    assert ref_str.pages == "155-172"