From 3e9cdaadf455ab1a9bb7289f7b9fa6c889f24669 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 27 Oct 2021 20:36:46 -0700 Subject: add more explicit single and multiple citation parsing functions --- grobid_tei_xml/__init__.py | 7 +++- grobid_tei_xml/parse.py | 26 ++++++++++++++- grobid_tei_xml/types.py | 21 ++++++++++++ tests/files/empty_citation.tei.xml | 6 ++++ tests/files/empty_citation_unstructured.tei.xml | 7 ++++ tests/test_parse.py | 44 +++++++++++++++++++++++-- 6 files changed, 106 insertions(+), 5 deletions(-) create mode 100644 tests/files/empty_citation.tei.xml create mode 100644 tests/files/empty_citation_unstructured.tei.xml diff --git a/grobid_tei_xml/__init__.py b/grobid_tei_xml/__init__.py index a465635..87e9363 100644 --- a/grobid_tei_xml/__init__.py +++ b/grobid_tei_xml/__init__.py @@ -1,4 +1,9 @@ __version__ = "0.1.1" -from .parse import parse_citations_xml, parse_document_xml +from .parse import ( + parse_citation_list_xml, + parse_citation_xml, + parse_citations_xml, + parse_document_xml, +) from .types import GrobidBiblio, GrobidDocument diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py index cd55f9a..4916b7f 100755 --- a/grobid_tei_xml/parse.py +++ b/grobid_tei_xml/parse.py @@ -270,7 +270,7 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument: return doc -def parse_citations_xml(xml_text: AnyStr) -> List[GrobidBiblio]: +def parse_citation_list_xml(xml_text: AnyStr) -> List[GrobidBiblio]: """ Use this function to parse TEI-XML of one or more references. 
This should work with either /api/processCitation or /api/processCitationList API @@ -299,3 +299,27 @@ def parse_citations_xml(xml_text: AnyStr) -> List[GrobidBiblio]: ref.index = i refs.append(ref) return refs + + +def parse_citations_xml(xml_text: AnyStr) -> List[GrobidBiblio]: + """ + Alias for `parse_citation_list_xml()` + """ + return parse_citation_list_xml(xml_text=xml_text) + + +def parse_citation_xml(xml_text: AnyStr) -> Optional[GrobidBiblio]: + """ + Parses a single citation. If the result is empty, or only contains the + 'unstructured' field, returns None. + """ + # internally, re-uses parse_citation_list_xml() + citation_list = parse_citation_list_xml(xml_text) + if not citation_list: + return None + citation = citation_list[0] + citation.index = None + if citation.is_empty(): + return None + else: + return citation diff --git a/grobid_tei_xml/types.py b/grobid_tei_xml/types.py index 725871b..32b773b 100644 --- a/grobid_tei_xml/types.py +++ b/grobid_tei_xml/types.py @@ -102,6 +102,27 @@ class GrobidBiblio: istex_id: Optional[str] = None url: Optional[str] = None + def is_empty(self) -> bool: + """ + If true, this record is empty (contains no actual metadata) + """ + if self.authors or self.editors: + return False + return not bool( + self.date + or self.title + or self.journal + or self.publisher + or self.volume + or self.issue + or self.pages + or self.doi + or self.pmid + or self.pmcid + or self.arxiv_id + or self.url + ) + def to_dict(self) -> dict: return _simplify_dict(asdict(self)) diff --git a/tests/files/empty_citation.tei.xml b/tests/files/empty_citation.tei.xml new file mode 100644 index 0000000..cb21f6e --- /dev/null +++ b/tests/files/empty_citation.tei.xml @@ -0,0 +1,6 @@ +<biblStruct > + <monogr> + <title/> + <imprint/> + </monogr> +</biblStruct> diff --git a/tests/files/empty_citation_unstructured.tei.xml b/tests/files/empty_citation_unstructured.tei.xml new file mode 100644 index 0000000..35aee19 --- /dev/null +++ b/tests/files/empty_citation_unstructured.tei.xml @@
-0,0 +1,7 @@ +<biblStruct > + <monogr> + <title/> + <imprint/> + </monogr> + <note type="raw_reference">blah</note> +</biblStruct> diff --git a/tests/test_parse.py b/tests/test_parse.py index 25529c4..70dcc98 100644 --- a/tests/test_parse.py +++ b/tests/test_parse.py @@ -5,7 +5,14 @@ import xml.etree.ElementTree import pytest -from grobid_tei_xml import GrobidBiblio, GrobidDocument, parse_citations_xml, parse_document_xml +from grobid_tei_xml import ( + GrobidBiblio, + GrobidDocument, + parse_citation_list_xml, + parse_citation_xml, + parse_citations_xml, + parse_document_xml, +) from grobid_tei_xml.types import * @@ -198,7 +205,8 @@ def test_single_citations_xml() -> None: </monogr> </biblStruct>""" - d = parse_citations_xml(citation_xml)[0] + d = parse_citation_xml(citation_xml) + assert d assert ( d.title == """Mesh migration following abdominal hernia repair: a comprehensive review""" @@ -217,13 +225,21 @@ def test_single_citations_xml() -> None: assert d.issue == "2" assert d.journal == "Hernia" + d2 = parse_citations_xml(citation_xml)[0] + assert d.title == d2.title + assert d.authors == d2.authors + def test_citation_list_xml() -> None: with open("tests/files/example_citation_list.xml", "r") as f: tei_xml = f.read() - citations = parse_citations_xml(tei_xml) + citations = parse_citation_list_xml(tei_xml) + + # verify that old function still works + assert citations == parse_citations_xml(tei_xml) + assert len(citations) == 13 assert citations[3].note == "The Research Handbook on International Environmental Law" @@ -300,3 +316,25 @@ def test_grobid_070_document() -> None: c.to_csl_dict() c.to_dict() c.to_legacy_dict() + + +def test_empty_citations() -> None: + + with open("tests/files/empty_citation_unstructured.tei.xml", "r") as f: + mostly_empty_xml = f.read() + + with open("tests/files/empty_citation.tei.xml", "r") as f: + empty_xml = f.read() + + assert parse_citation_xml(empty_xml) is None + assert parse_citation_xml(mostly_empty_xml) is None + + d = 
parse_citation_list_xml(empty_xml) + assert d + assert d[0].index == 0 + assert d[0].unstructured is None + + d2 = parse_citation_list_xml(mostly_empty_xml) + assert d2 + assert d2[0].index == 0 + assert d2[0].unstructured == "blah" -- cgit v1.2.3