diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-29 10:48:00 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-29 10:48:06 -0700 |
commit | 2c4aba8dbc0d548f0007000c9297a39d4ab67fe2 (patch) | |
tree | e9901b207febfbabb634582de697cdc2aa73504b | |
parent | 38b758e043eef8a48eba7f1e5bf385d91b375e5e (diff) | |
download | grobid_tei_xml-2c4aba8dbc0d548f0007000c9297a39d4ab67fe2.tar.gz grobid_tei_xml-2c4aba8dbc0d548f0007000c9297a39d4ab67fe2.zip |
add UTF-8 processCitation tests
-rw-r--r-- | tests/files/citation_emdash.tei.xml | 20 | ||||
-rw-r--r-- | tests/files/citation_list_emdash.tei.xml | 66 | ||||
-rw-r--r-- | tests/test_parse.py | 48 |
3 files changed, 134 insertions, 0 deletions
diff --git a/tests/files/citation_emdash.tei.xml b/tests/files/citation_emdash.tei.xml new file mode 100644 index 0000000..846085c --- /dev/null +++ b/tests/files/citation_emdash.tei.xml @@ -0,0 +1,20 @@ +<biblStruct > + <analytic> + <title level="a" type="main">A world of individuals</title> + <author> + <persName> + <forename type="first">N</forename> + <surname>Goodman</surname> + </persName> + </author> + </analytic> + <monogr> + <title level="m">Problems and projects</title> + <imprint> + <date type="published" when="1972">1972</date> + <biblScope unit="page" from="155" to="172" /> + </imprint> + </monogr> + <note type="raw_reference">Goodman, N. (1972). A world of individuals. In Problems and projects (pp. 155–172). Bobs-Merrill company.</note> +</biblStruct> + diff --git a/tests/files/citation_list_emdash.tei.xml b/tests/files/citation_list_emdash.tei.xml new file mode 100644 index 0000000..b47f85b --- /dev/null +++ b/tests/files/citation_list_emdash.tei.xml @@ -0,0 +1,66 @@ +<TEI xmlns="http://www.tei-c.org/ns/1.0" xmlns:xlink="http://www.w3.org/1999/xlink" + xmlns:mml="http://www.w3.org/1998/Math/MathML"> + <teiHeader/> + <text> + <front/> + <body/> + <back> + <div> + <listBibl> +<biblStruct xml:id="b0"> + <analytic> + <title level="a" type="main">A world of individuals</title> + <author> + <persName><forename type="first">N</forename><surname>Goodman</surname></persName> + </author> + </analytic> + <monogr> + <title level="m">Problems and projects</title> + <imprint> + <date type="published" when="1972">1972</date> + <biblScope unit="page" from="155" to="172" /> + </imprint> + </monogr> + <note type="raw_reference">Goodman, N. (1972). A world of individuals. In Problems and projects (pp. 155–172). Bobs-Merrill company.</note> +</biblStruct> + +<biblStruct xml:id="b1"> + <analytic> + <title level="a" type="main">Implicit definition sustained</title> + <author> + <persName><forename type="first">W</forename><forename type="middle">V O</forename><surname>Quine</surname></persName> + </author> + </analytic> + <monogr> + <title level="m">The ways of paradox and other essays</title> + <meeting><address><addrLine>Cambridge, MA</addrLine></address></meeting> + <imprint> + <publisher>Harvard University Press</publisher> + <date type="published" when="1976">1976b</date> + <biblScope unit="page" from="133" to="136" /> + </imprint> + </monogr> + <note type="raw_reference">Quine, W. V. O. (1976b). Implicit definition sustained. In The ways of paradox and other essays (2. enlarged and revised ed., pp. 133–136). Cambridge, MA: Harvard University Press.</note> +</biblStruct> + +<biblStruct xml:id="b2"> + <monogr> + <title level="m" type="main">On some difficulties in the theory of transfinite numbers and order types</title> + <author> + <persName><forename type="first">B</forename><surname>Russell</surname></persName> + </author> + <imprint> + <date type="published" when="1906">1906</date> + <publisher>Proceedings of London Mathematical Society</publisher> + <biblScope unit="volume">4</biblScope> + <biblScope unit="page" from="29" to="53" /> + </imprint> + </monogr> + <note type="raw_reference">Russell, B. (1906). On some difficulties in the theory of transfinite numbers and order types. Proceedings of London Mathematical Society, 4, 29–53.</note> +</biblStruct> + + </listBibl> + </div> + </back> + </text> +</TEI> diff --git a/tests/test_parse.py b/tests/test_parse.py index 70dcc98..5f64a48 100644 --- a/tests/test_parse.py +++ b/tests/test_parse.py @@ -338,3 +338,51 @@ def test_empty_citations() -> None: assert d2 assert d2[0].index == 0 assert d2[0].unstructured == "blah" + + +def test_citation_emdash() -> None: + + with open("tests/files/citation_emdash.tei.xml", "rb") as f: + tei_xml_bytes = f.read() + with open("tests/files/citation_emdash.tei.xml", "r") as f2: + tei_xml_str = f2.read() + + # that dash is a unicode emdash + unstructured = "Goodman, N. (1972). A world of individuals. In Problems and projects (pp. 155–172). Bobs-Merrill company." + assert unstructured[70:81] == "pp. 155\u2013172" + assert "pp. 155\u2013172".encode("utf-8") in tei_xml_bytes + assert "pp. 155\u2013172" in tei_xml_str + + ref_bytes = parse_citation_xml(tei_xml_bytes) + assert ref_bytes + assert ref_bytes.unstructured == unstructured + assert ref_bytes.first_page == "155" + assert ref_bytes.pages == "155-172" + + ref_str = parse_citation_xml(tei_xml_str) + assert ref_str + assert ref_str.unstructured == unstructured + assert ref_str.first_page == "155" + assert ref_str.pages == "155-172" + + +def test_citation_list_utf8() -> None: + with open("tests/files/citation_list_emdash.tei.xml", "rb") as f: + tei_xml_bytes = f.read() + with open("tests/files/citation_list_emdash.tei.xml", "r") as f2: + tei_xml_str = f2.read() + + unstructured = "Goodman, N. (1972). A world of individuals. In Problems and projects (pp. 155–172). Bobs-Merrill company." + assert unstructured[70:81] == "pp. 155\u2013172" + + ref_bytes = parse_citation_list_xml(tei_xml_bytes)[0] + assert ref_bytes + assert ref_bytes.unstructured == unstructured + assert ref_bytes.first_page == "155" + assert ref_bytes.pages == "155-172" + + ref_str = parse_citation_list_xml(tei_xml_str)[0] + assert ref_str + assert ref_str.unstructured == unstructured + assert ref_str.first_page == "155" + assert ref_str.pages == "155-172" |