summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-10-29 10:48:00 -0700
committer Bryan Newbold <bnewbold@archive.org> 2021-10-29 10:48:06 -0700
commit 2c4aba8dbc0d548f0007000c9297a39d4ab67fe2 (patch)
tree e9901b207febfbabb634582de697cdc2aa73504b
parent 38b758e043eef8a48eba7f1e5bf385d91b375e5e (diff)
download grobid_tei_xml-2c4aba8dbc0d548f0007000c9297a39d4ab67fe2.tar.gz
grobid_tei_xml-2c4aba8dbc0d548f0007000c9297a39d4ab67fe2.zip
add UTF-8 processCitation tests
-rw-r--r-- tests/files/citation_emdash.tei.xml 20
-rw-r--r-- tests/files/citation_list_emdash.tei.xml 66
-rw-r--r-- tests/test_parse.py 48
3 files changed, 134 insertions, 0 deletions
diff --git a/tests/files/citation_emdash.tei.xml b/tests/files/citation_emdash.tei.xml
new file mode 100644
index 0000000..846085c
--- /dev/null
+++ b/tests/files/citation_emdash.tei.xml
@@ -0,0 +1,20 @@
+<biblStruct >
+ <analytic>
+ <title level="a" type="main">A world of individuals</title>
+ <author>
+ <persName>
+ <forename type="first">N</forename>
+ <surname>Goodman</surname>
+ </persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="m">Problems and projects</title>
+ <imprint>
+ <date type="published" when="1972">1972</date>
+ <biblScope unit="page" from="155" to="172" />
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Goodman, N. (1972). A world of individuals. In Problems and projects (pp. 155–172). Bobs-Merrill company.</note>
+</biblStruct>
+
diff --git a/tests/files/citation_list_emdash.tei.xml b/tests/files/citation_list_emdash.tei.xml
new file mode 100644
index 0000000..b47f85b
--- /dev/null
+++ b/tests/files/citation_list_emdash.tei.xml
@@ -0,0 +1,66 @@
+<TEI xmlns="http://www.tei-c.org/ns/1.0" xmlns:xlink="http://www.w3.org/1999/xlink"
+ xmlns:mml="http://www.w3.org/1998/Math/MathML">
+ <teiHeader/>
+ <text>
+ <front/>
+ <body/>
+ <back>
+ <div>
+ <listBibl>
+<biblStruct xml:id="b0">
+ <analytic>
+ <title level="a" type="main">A world of individuals</title>
+ <author>
+ <persName><forename type="first">N</forename><surname>Goodman</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="m">Problems and projects</title>
+ <imprint>
+ <date type="published" when="1972">1972</date>
+ <biblScope unit="page" from="155" to="172" />
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Goodman, N. (1972). A world of individuals. In Problems and projects (pp. 155–172). Bobs-Merrill company.</note>
+</biblStruct>
+
+<biblStruct xml:id="b1">
+ <analytic>
+ <title level="a" type="main">Implicit definition sustained</title>
+ <author>
+ <persName><forename type="first">W</forename><forename type="middle">V O</forename><surname>Quine</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="m">The ways of paradox and other essays</title>
+ <meeting><address><addrLine>Cambridge, MA</addrLine></address></meeting>
+ <imprint>
+ <publisher>Harvard University Press</publisher>
+ <date type="published" when="1976">1976b</date>
+ <biblScope unit="page" from="133" to="136" />
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Quine, W. V. O. (1976b). Implicit definition sustained. In The ways of paradox and other essays (2. enlarged and revised ed., pp. 133–136). Cambridge, MA: Harvard University Press.</note>
+</biblStruct>
+
+<biblStruct xml:id="b2">
+ <monogr>
+ <title level="m" type="main">On some difficulties in the theory of transfinite numbers and order types</title>
+ <author>
+ <persName><forename type="first">B</forename><surname>Russell</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="1906">1906</date>
+ <publisher>Proceedings of London Mathematical Society</publisher>
+ <biblScope unit="volume">4</biblScope>
+ <biblScope unit="page" from="29" to="53" />
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Russell, B. (1906). On some difficulties in the theory of transfinite numbers and order types. Proceedings of London Mathematical Society, 4, 29–53.</note>
+</biblStruct>
+
+ </listBibl>
+ </div>
+ </back>
+ </text>
+</TEI>
diff --git a/tests/test_parse.py b/tests/test_parse.py
index 70dcc98..5f64a48 100644
--- a/tests/test_parse.py
+++ b/tests/test_parse.py
@@ -338,3 +338,51 @@ def test_empty_citations() -> None:
assert d2
assert d2[0].index == 0
assert d2[0].unstructured == "blah"
+
+
+def test_citation_emdash() -> None:
+    """A single citation with a U+2013 dash parses the same from bytes and from str."""
+    with open("tests/files/citation_emdash.tei.xml", "rb") as f:
+        tei_xml_bytes = f.read()
+    # decode as UTF-8 explicitly; the default text encoding depends on the locale
+    with open("tests/files/citation_emdash.tei.xml", "r", encoding="utf-8") as f2:
+        tei_xml_str = f2.read()
+    # the dash in "155–172" is U+2013 EN DASH, not an em dash or an ASCII hyphen
+    unstructured = "Goodman, N. (1972). A world of individuals. In Problems and projects (pp. 155–172). Bobs-Merrill company."
+    assert unstructured[70:81] == "pp. 155\u2013172"
+    assert "pp. 155\u2013172".encode("utf-8") in tei_xml_bytes
+    assert "pp. 155\u2013172" in tei_xml_str
+
+    ref_bytes = parse_citation_xml(tei_xml_bytes)
+    assert ref_bytes
+    assert ref_bytes.unstructured == unstructured
+    assert ref_bytes.first_page == "155"
+    assert ref_bytes.pages == "155-172"
+
+    ref_str = parse_citation_xml(tei_xml_str)
+    assert ref_str
+    assert ref_str.unstructured == unstructured
+    assert ref_str.first_page == "155"
+    assert ref_str.pages == "155-172"
+
+
+def test_citation_list_utf8() -> None:
+    """The first entry of a citation list keeps its U+2013 dash for bytes and str input."""
+    with open("tests/files/citation_list_emdash.tei.xml", "rb") as f:
+        tei_xml_bytes = f.read()
+    # decode as UTF-8 explicitly; the default text encoding depends on the locale
+    with open("tests/files/citation_list_emdash.tei.xml", "r", encoding="utf-8") as f2:
+        tei_xml_str = f2.read()
+    unstructured = "Goodman, N. (1972). A world of individuals. In Problems and projects (pp. 155–172). Bobs-Merrill company."
+    assert unstructured[70:81] == "pp. 155\u2013172"
+    ref_bytes = parse_citation_list_xml(tei_xml_bytes)[0]
+    assert ref_bytes
+    assert ref_bytes.unstructured == unstructured
+    assert ref_bytes.first_page == "155"
+    assert ref_bytes.pages == "155-172"
+
+    ref_str = parse_citation_list_xml(tei_xml_str)[0]
+    assert ref_str
+    assert ref_str.unstructured == unstructured
+    assert ref_str.first_page == "155"
+    assert ref_str.pages == "155-172"