summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-11-03 20:30:02 -0700
committer Bryan Newbold <bnewbold@archive.org> 2021-11-03 20:30:02 -0700
commit 132d257befc2088a98b3fe10aa71713338d15673 (patch)
tree 92e7be2493c779ed50be123bee353f0b5859f0e4
parent 6ad771e195065a1f22ddbe5a4098acc70137be9b (diff)
download grobid_tei_xml-132d257befc2088a98b3fe10aa71713338d15673.tar.gz
grobid_tei_xml-132d257befc2088a98b3fe10aa71713338d15673.zip
add test for XML parse exception behavior
-rw-r--r-- tests/files/citation_list/parse_error.tei.xml 64
-rw-r--r-- tests/test_errors.py 23
2 files changed, 87 insertions, 0 deletions
diff --git a/tests/files/citation_list/parse_error.tei.xml b/tests/files/citation_list/parse_error.tei.xml
new file mode 100644
index 0000000..fc7b4fd
--- /dev/null
+++ b/tests/files/citation_list/parse_error.tei.xml
@@ -0,0 +1,64 @@
+<TEI xmlns="http://www.tei-c.org/ns/1.0" xmlns:xlink="http://www.w3.org/1999/xlink"
+ xmlns:mml="http://www.w3.org/1998/Math/MathML">
+ <teiHeader/>
+ <text>
+ <front/>
+ <body/>
+ <back>
+ <div>
+ <listBibl>
+
+<biblStruct xml:id="b3">
+ <analytic>
+ <title level="a" type="main">Design and Analysis of an Optimi-zation Model by using Scheduling Algorithm for Electric Power Cycles</title>
+ <author>
+ <persName><forename type="first">S</forename><surname>Maheswari</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">C</forename><surname>Vijayalakshmi</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="m">Proceedings of the National Conference On Applied Ma-thematics</title>
+ <editor>
+ <persName><forename type="first">B</forename><forename type="middle">S</forename><surname>Abdur Rahman University</surname></persName>
+ <persName><surname>Chennai</surname></persName>
+ </editor>
+ <meeting>the National Conference On Applied Ma-thematics</meeting>
+ <imprint>
+ <date type="published" when="2010-01">NCAM 2010. January 2010</date>
+ <biblScope unit="page" from="160" to="163" />
+ </imprint>
+ </monogr>
+ <note type="raw_reference">Maheswari, S., Vijayalakshmi, C.: Design and Analysis of an Optimi-zation Model by using Scheduling Algorithm for Electric Power Cycles. In: Proceedings of the National Conference On Applied Ma-thematics (NCAM 2010), B.S. Abdur Rahman University Chennai, pp. 160–163 (January 2010)</note>
+</biblStruct>
+
+<biblStruct xml:id="b4">
+ <analytic>
+ <title level="a" type="main">Optimization Model for Electricity Distribution System Control using Communication System by La-grangian Relaxation Technique</title>
+ <author>
+ <persName><forename type="first">S</forename><surname>Maheswari</surname></persName>
+ </author>
+ <author>
+ <persName><forename type="first">C</forename><surname>Vijayalakshmi</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">CiiT International Journal of Wireless Communication</title>
+ <idno type="ISSN">0974 - 9756 & Online: 0974 - 9640</idno>
+ <imprint>
+ <biblScope unit="volume">3</biblScope>
+ <biblScope unit="issue">3</biblScope>
+ <biblScope unit="page" from="183" to="187" />
+ <date type="published" when="2011">2011</date>
+ </imprint>
+ </monogr>
+ <note type="report_type">Print:</note>
+ <note type="raw_reference">Maheswari, S., Vijayalakshmi, C.: Optimization Model for Electricity Distribution System Control using Communication System by La-grangian Relaxation Technique. CiiT International Journal of Wireless Communication 3(3), 183–187 (2011) (Print: ISSN 0974 – 9756 &amp; Online: ISSN 0974 – 9640)</note>
+</biblStruct>
+
+ </listBibl>
+ </div>
+ </back>
+ </text>
+</TEI>
diff --git a/tests/test_errors.py b/tests/test_errors.py
new file mode 100644
index 0000000..8537551
--- /dev/null
+++ b/tests/test_errors.py
@@ -0,0 +1,23 @@
+import xml.etree.ElementTree
+
+import pytest
+
+from grobid_tei_xml import parse_citation_list_xml
+
+
+def test_doc_parse_error() -> None:
+    """
+    Pin the exception raised when a citation-list TEI-XML document is malformed.
+
+    The fixture has a bare '&' (should be '&amp;') inside an <idno> element, so
+    parsing it fails. See also: https://github.com/kermitt2/grobid/issues/848
+
+    The intent of this test is to ensure that the exception raised is the one
+    expected, especially if that behavior changes in the future.
+    """
+    # Decode as UTF-8 explicitly: the fixture has non-ASCII text (en dashes).
+    with open("tests/files/citation_list/parse_error.tei.xml", "r", encoding="utf-8") as f:
+        tei_xml = f.read()
+
+    with pytest.raises(xml.etree.ElementTree.ParseError):
+        parse_citation_list_xml(tei_xml)