From 2c4aba8dbc0d548f0007000c9297a39d4ab67fe2 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 29 Oct 2021 10:48:00 -0700 Subject: add UTF-8 processCitation tests --- tests/files/citation_emdash.tei.xml | 20 ++++++++++ tests/files/citation_list_emdash.tei.xml | 66 ++++++++++++++++++++++++++++++++ tests/test_parse.py | 48 +++++++++++++++++++++++ 3 files changed, 134 insertions(+) create mode 100644 tests/files/citation_emdash.tei.xml create mode 100644 tests/files/citation_list_emdash.tei.xml diff --git a/tests/files/citation_emdash.tei.xml b/tests/files/citation_emdash.tei.xml new file mode 100644 index 0000000..846085c --- /dev/null +++ b/tests/files/citation_emdash.tei.xml @@ -0,0 +1,20 @@ + + + A world of individuals + + + N + Goodman + + + + + Problems and projects + + 1972 + + + + Goodman, N. (1972). A world of individuals. In Problems and projects (pp. 155–172). Bobs-Merrill company. + + diff --git a/tests/files/citation_list_emdash.tei.xml b/tests/files/citation_list_emdash.tei.xml new file mode 100644 index 0000000..b47f85b --- /dev/null +++ b/tests/files/citation_list_emdash.tei.xml @@ -0,0 +1,66 @@ + + + + + + +
+ + + + A world of individuals + + NGoodman + + + + Problems and projects + + 1972 + + + + Goodman, N. (1972). A world of individuals. In Problems and projects (pp. 155–172). Bobs-Merrill company. + + + + + Implicit definition sustained + + WV OQuine + + + + The ways of paradox and other essays +
Cambridge, MA
+ + Harvard University Press + 1976b + + +
+ Quine, W. V. O. (1976b). Implicit definition sustained. In The ways of paradox and other essays (2. enlarged and revised ed., pp. 133–136). Cambridge, MA: Harvard University Press. +
+ + + + On some difficulties in the theory of transfinite numbers and order types + + BRussell + + + 1906 + Proceedings of London Mathematical Society + 4 + + + + Russell, B. (1906). On some difficulties in the theory of transfinite numbers and order types. Proceedings of London Mathematical Society, 4, 29–53. + + +
+
+
+
+
# Tests appended to tests/test_parse.py (after test_empty_citations) covering
# non-ASCII (UTF-8) characters in citation XML passed as both bytes and str.


def test_citation_emdash() -> None:
    """Parse a single-citation fixture containing U+2013, as bytes and as str.

    The fixture's raw citation string has a page range written with an EN DASH
    (U+2013; the test and fixture names say "emdash" for historical reasons).
    The same assertions are applied to the result of parsing the bytes and the
    decoded text.
    """
    with open("tests/files/citation_emdash.tei.xml", "rb") as f:
        tei_xml_bytes = f.read()
    # Decode explicitly as UTF-8: relying on the locale's default encoding
    # would mis-decode (or fail on) the dash on non-UTF-8 platforms.
    with open("tests/files/citation_emdash.tei.xml", "r", encoding="utf-8") as f2:
        tei_xml_str = f2.read()

    # that dash is a unicode en dash (U+2013), not an ASCII hyphen
    unstructured = "Goodman, N. (1972). A world of individuals. In Problems and projects (pp. 155–172). Bobs-Merrill company."
    # Sanity checks: the literal above and both fixture reads really contain U+2013.
    assert unstructured[70:81] == "pp. 155\u2013172"
    assert "pp. 155\u2013172".encode("utf-8") in tei_xml_bytes
    assert "pp. 155\u2013172" in tei_xml_str

    ref_bytes = parse_citation_xml(tei_xml_bytes)
    assert ref_bytes
    assert ref_bytes.unstructured == unstructured
    assert ref_bytes.first_page == "155"
    assert ref_bytes.pages == "155-172"

    ref_str = parse_citation_xml(tei_xml_str)
    assert ref_str
    assert ref_str.unstructured == unstructured
    assert ref_str.first_page == "155"
    assert ref_str.pages == "155-172"


def test_citation_list_utf8() -> None:
    """Parse a citation-list fixture containing U+2013, as bytes and as str.

    Only the first citation in the list is checked; it is the same reference
    used by test_citation_emdash.
    """
    with open("tests/files/citation_list_emdash.tei.xml", "rb") as f:
        tei_xml_bytes = f.read()
    # Decode explicitly as UTF-8 so the test does not depend on the locale.
    with open("tests/files/citation_list_emdash.tei.xml", "r", encoding="utf-8") as f2:
        tei_xml_str = f2.read()

    unstructured = "Goodman, N. (1972). A world of individuals. In Problems and projects (pp. 155–172). Bobs-Merrill company."
    # Sanity checks: the literal above and both fixture reads really contain U+2013.
    assert unstructured[70:81] == "pp. 155\u2013172"
    assert "pp. 155\u2013172".encode("utf-8") in tei_xml_bytes
    assert "pp. 155\u2013172" in tei_xml_str

    refs_bytes = parse_citation_list_xml(tei_xml_bytes)
    # Check non-emptiness first so a regression fails as an assertion,
    # not as an IndexError on the [0] lookup.
    assert refs_bytes
    ref_bytes = refs_bytes[0]
    assert ref_bytes
    assert ref_bytes.unstructured == unstructured
    assert ref_bytes.first_page == "155"
    assert ref_bytes.pages == "155-172"

    refs_str = parse_citation_list_xml(tei_xml_str)
    assert refs_str
    ref_str = refs_str[0]
    assert ref_str
    assert ref_str.unstructured == unstructured
    assert ref_str.first_page == "155"
    assert ref_str.pages == "155-172"