aboutsummaryrefslogtreecommitdiffstats
path: root/tests/test_parse.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-10-29 10:48:00 -0700
committer Bryan Newbold <bnewbold@archive.org> 2021-10-29 10:48:06 -0700
commit 2c4aba8dbc0d548f0007000c9297a39d4ab67fe2 (patch)
tree e9901b207febfbabb634582de697cdc2aa73504b /tests/test_parse.py
parent 38b758e043eef8a48eba7f1e5bf385d91b375e5e (diff)
download grobid_tei_xml-2c4aba8dbc0d548f0007000c9297a39d4ab67fe2.tar.gz
grobid_tei_xml-2c4aba8dbc0d548f0007000c9297a39d4ab67fe2.zip
add UTF-8 processCitation tests
Diffstat (limited to 'tests/test_parse.py')
-rw-r--r-- tests/test_parse.py 48
1 files changed, 48 insertions, 0 deletions
diff --git a/tests/test_parse.py b/tests/test_parse.py
index 70dcc98..5f64a48 100644
--- a/tests/test_parse.py
+++ b/tests/test_parse.py
@@ -338,3 +338,51 @@ def test_empty_citations() -> None:
assert d2
assert d2[0].index == 0
assert d2[0].unstructured == "blah"
+
+
+def test_citation_emdash() -> None:
+    """Parse one citation containing U+2013 from both bytes and str input.
+
+    The same fixture is read once in binary mode and once in text mode, and
+    ``parse_citation_xml`` is expected to return identical fields for both.
+    """
+    fixture_path = "tests/files/citation_emdash.tei.xml"
+
+    with open(fixture_path, "rb") as f:
+        tei_xml_bytes = f.read()
+    # Decode explicitly as UTF-8: the default text-mode encoding follows the
+    # platform locale, and the fixture is asserted below to hold UTF-8 bytes.
+    with open(fixture_path, "r", encoding="utf-8") as f2:
+        tei_xml_str = f2.read()
+
+    # NOTE: despite the "emdash" naming, the dash here is U+2013 (EN DASH)
+    unstructured = "Goodman, N. (1972). A world of individuals. In Problems and projects (pp. 155–172). Bobs-Merrill company."
+    # sanity-check the literal above and that both reads contain the dash
+    assert unstructured[70:81] == "pp. 155\u2013172"
+    assert "pp. 155\u2013172".encode("utf-8") in tei_xml_bytes
+    assert "pp. 155\u2013172" in tei_xml_str
+
+    # bytes input: the unstructured text keeps the en dash, while the
+    # structured page range is expected with a plain ASCII hyphen
+    ref_bytes = parse_citation_xml(tei_xml_bytes)
+    assert ref_bytes
+    assert ref_bytes.unstructured == unstructured
+    assert ref_bytes.first_page == "155"
+    assert ref_bytes.pages == "155-172"
+
+    # str input must yield the same result as bytes input
+    ref_str = parse_citation_xml(tei_xml_str)
+    assert ref_str
+    assert ref_str.unstructured == unstructured
+    assert ref_str.first_page == "155"
+    assert ref_str.pages == "155-172"
+
+
+def test_citation_list_utf8() -> None:
+    """Parse a citation list containing U+2013 from both bytes and str input.
+
+    Only the first entry returned by ``parse_citation_list_xml`` is checked,
+    once for the binary read of the fixture and once for the text read.
+    """
+    fixture_path = "tests/files/citation_list_emdash.tei.xml"
+
+    with open(fixture_path, "rb") as f:
+        tei_xml_bytes = f.read()
+    # Decode explicitly rather than relying on the locale-dependent default.
+    # NOTE(review): assumes the fixture is UTF-8, as the test name implies.
+    with open(fixture_path, "r", encoding="utf-8") as f2:
+        tei_xml_str = f2.read()
+
+    # the dash in the page range is U+2013 (EN DASH)
+    unstructured = "Goodman, N. (1972). A world of individuals. In Problems and projects (pp. 155–172). Bobs-Merrill company."
+    assert unstructured[70:81] == "pp. 155\u2013172"
+
+    # bytes input: the unstructured text keeps the en dash, while the
+    # structured page range is expected with a plain ASCII hyphen
+    ref_bytes = parse_citation_list_xml(tei_xml_bytes)[0]
+    assert ref_bytes
+    assert ref_bytes.unstructured == unstructured
+    assert ref_bytes.first_page == "155"
+    assert ref_bytes.pages == "155-172"
+
+    # str input must yield the same result as bytes input
+    ref_str = parse_citation_list_xml(tei_xml_str)[0]
+    assert ref_str
+    assert ref_str.unstructured == unstructured
+    assert ref_str.first_page == "155"
+    assert ref_str.pages == "155-172"