diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-27 20:36:46 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-27 20:36:46 -0700 |
commit | 3e9cdaadf455ab1a9bb7289f7b9fa6c889f24669 (patch) | |
tree | 87800571c10cdad8f958080f19c60fb86109178e /tests | |
parent | 3e3902efe418677054d1711a3de1ab8c4cd2c57c (diff) | |
download | grobid_tei_xml-3e9cdaadf455ab1a9bb7289f7b9fa6c889f24669.tar.gz grobid_tei_xml-3e9cdaadf455ab1a9bb7289f7b9fa6c889f24669.zip |
add more explicit single and multiple citation parsing functions
Diffstat (limited to 'tests')
-rw-r--r-- | tests/files/empty_citation.tei.xml | 6 | ||||
-rw-r--r-- | tests/files/empty_citation_unstructured.tei.xml | 7 | ||||
-rw-r--r-- | tests/test_parse.py | 44 |
3 files changed, 54 insertions, 3 deletions
diff --git a/tests/files/empty_citation.tei.xml b/tests/files/empty_citation.tei.xml new file mode 100644 index 0000000..cb21f6e --- /dev/null +++ b/tests/files/empty_citation.tei.xml @@ -0,0 +1,6 @@ +<biblStruct > + <monogr> + <title/> + <imprint/> + </monogr> +</biblStruct> diff --git a/tests/files/empty_citation_unstructured.tei.xml b/tests/files/empty_citation_unstructured.tei.xml new file mode 100644 index 0000000..35aee19 --- /dev/null +++ b/tests/files/empty_citation_unstructured.tei.xml @@ -0,0 +1,7 @@ +<biblStruct > + <monogr> + <title/> + <imprint/> + </monogr> + <note type="raw_reference">blah</note> +</biblStruct> diff --git a/tests/test_parse.py b/tests/test_parse.py index 25529c4..70dcc98 100644 --- a/tests/test_parse.py +++ b/tests/test_parse.py @@ -5,7 +5,14 @@ import xml.etree.ElementTree import pytest -from grobid_tei_xml import GrobidBiblio, GrobidDocument, parse_citations_xml, parse_document_xml +from grobid_tei_xml import ( + GrobidBiblio, + GrobidDocument, + parse_citation_list_xml, + parse_citation_xml, + parse_citations_xml, + parse_document_xml, +) from grobid_tei_xml.types import * @@ -198,7 +205,8 @@ def test_single_citations_xml() -> None: </monogr> </biblStruct>""" - d = parse_citations_xml(citation_xml)[0] + d = parse_citation_xml(citation_xml) + assert d assert ( d.title == """Mesh migration following abdominal hernia repair: a comprehensive review""" @@ -217,13 +225,21 @@ def test_single_citations_xml() -> None: assert d.issue == "2" assert d.journal == "Hernia" + d2 = parse_citations_xml(citation_xml)[0] + assert d.title == d2.title + assert d.authors == d2.authors + def test_citation_list_xml() -> None: with open("tests/files/example_citation_list.xml", "r") as f: tei_xml = f.read() - citations = parse_citations_xml(tei_xml) + citations = parse_citation_list_xml(tei_xml) + + # verify that old function still works + assert citations == parse_citations_xml(tei_xml) + assert len(citations) == 13 assert citations[3].note == "The Research Handbook on International Environmental Law" @@ -300,3 +316,25 @@ def test_grobid_070_document() -> None: c.to_csl_dict() c.to_dict() c.to_legacy_dict() + + +def test_empty_citations() -> None: + + with open("tests/files/empty_citation_unstructured.tei.xml", "r") as f: + mostly_empty_xml = f.read() + + with open("tests/files/empty_citation.tei.xml", "r") as f: + empty_xml = f.read() + + assert parse_citation_xml(empty_xml) is None + assert parse_citation_xml(mostly_empty_xml) is None + + d = parse_citation_list_xml(empty_xml) + assert d + assert d[0].index == 0 + assert d[0].unstructured is None + + d2 = parse_citation_list_xml(mostly_empty_xml) + assert d2 + assert d2[0].index == 0 + assert d2[0].unstructured == "blah" |