about | summary | refs | log | tree | commit | diff | stats
path: root/tests/test_parse.py
diff options
context:
space:
mode:
Diffstat (limited to 'tests/test_parse.py')
-rw-r--r-- tests/test_parse.py 48
1 files changed, 48 insertions, 0 deletions
diff --git a/tests/test_parse.py b/tests/test_parse.py
index 70dcc98..5f64a48 100644
--- a/tests/test_parse.py
+++ b/tests/test_parse.py
@@ -338,3 +338,51 @@ def test_empty_citations() -> None:
assert d2
assert d2[0].index == 0
assert d2[0].unstructured == "blah"
+
+
+def test_citation_emdash() -> None:
+    """A U+2013 dash in a single citation must parse the same from bytes and from str input."""
+    with open("tests/files/citation_emdash.tei.xml", "rb") as f:
+        tei_xml_bytes = f.read()
+    with open("tests/files/citation_emdash.tei.xml", "r", encoding="utf-8") as f2:  # not the locale default
+        tei_xml_str = f2.read()
+
+    # that dash is U+2013 (EN DASH, not a true em dash); the fixture holds it UTF-8 encoded
+    unstructured = "Goodman, N. (1972). A world of individuals. In Problems and projects (pp. 155–172). Bobs-Merrill company."
+    assert unstructured[70:81] == "pp. 155\u2013172"
+    assert "pp. 155\u2013172".encode("utf-8") in tei_xml_bytes
+    assert "pp. 155\u2013172" in tei_xml_str
+
+    ref_bytes = parse_citation_xml(tei_xml_bytes)
+    assert ref_bytes
+    assert ref_bytes.unstructured == unstructured
+    assert ref_bytes.first_page == "155"
+    assert ref_bytes.pages == "155-172"
+
+    ref_str = parse_citation_xml(tei_xml_str)
+    assert ref_str
+    assert ref_str.unstructured == unstructured
+    assert ref_str.first_page == "155"
+    assert ref_str.pages == "155-172"
+
+
+def test_citation_list_utf8() -> None:
+    """The first citation of a list must keep its U+2013 (EN DASH) from bytes and from str input."""
+    with open("tests/files/citation_list_emdash.tei.xml", "rb") as f:
+        tei_xml_bytes = f.read()
+    with open("tests/files/citation_list_emdash.tei.xml", "r", encoding="utf-8") as f2:  # not the locale default
+        tei_xml_str = f2.read()
+    unstructured = "Goodman, N. (1972). A world of individuals. In Problems and projects (pp. 155–172). Bobs-Merrill company."
+    assert unstructured[70:81] == "pp. 155\u2013172"
+
+    ref_bytes = parse_citation_list_xml(tei_xml_bytes)[0]
+    assert ref_bytes
+    assert ref_bytes.unstructured == unstructured
+    assert ref_bytes.first_page == "155"
+    assert ref_bytes.pages == "155-172"
+
+    ref_str = parse_citation_list_xml(tei_xml_str)[0]
+    assert ref_str
+    assert ref_str.unstructured == unstructured
+    assert ref_str.first_page == "155"
+    assert ref_str.pages == "155-172"