about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2021-10-27 20:36:46 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-10-27 20:36:46 -0700
commit: 3e9cdaadf455ab1a9bb7289f7b9fa6c889f24669 (patch)
tree: 87800571c10cdad8f958080f19c60fb86109178e
parent: 3e3902efe418677054d1711a3de1ab8c4cd2c57c (diff)
download: grobid_tei_xml-3e9cdaadf455ab1a9bb7289f7b9fa6c889f24669.tar.gz
download: grobid_tei_xml-3e9cdaadf455ab1a9bb7289f7b9fa6c889f24669.zip
add more explicit single and multiple citation parsing functions
-rw-r--r-- grobid_tei_xml/__init__.py 7
-rwxr-xr-x grobid_tei_xml/parse.py 26
-rw-r--r-- grobid_tei_xml/types.py 21
-rw-r--r-- tests/files/empty_citation.tei.xml 6
-rw-r--r-- tests/files/empty_citation_unstructured.tei.xml 7
-rw-r--r-- tests/test_parse.py 44
6 files changed, 106 insertions, 5 deletions
diff --git a/grobid_tei_xml/__init__.py b/grobid_tei_xml/__init__.py
index a465635..87e9363 100644
--- a/grobid_tei_xml/__init__.py
+++ b/grobid_tei_xml/__init__.py
@@ -1,4 +1,9 @@
__version__ = "0.1.1"
-from .parse import parse_citations_xml, parse_document_xml
+from .parse import (
+ parse_citation_list_xml,
+ parse_citation_xml,
+ parse_citations_xml,
+ parse_document_xml,
+)
from .types import GrobidBiblio, GrobidDocument
diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py
index cd55f9a..4916b7f 100755
--- a/grobid_tei_xml/parse.py
+++ b/grobid_tei_xml/parse.py
@@ -270,7 +270,7 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
return doc
-def parse_citations_xml(xml_text: AnyStr) -> List[GrobidBiblio]:
+def parse_citation_list_xml(xml_text: AnyStr) -> List[GrobidBiblio]:
"""
Use this function to parse TEI-XML of one or more references. This should
work with either /api/processCitation or /api/processCitationList API
@@ -299,3 +299,27 @@ def parse_citations_xml(xml_text: AnyStr) -> List[GrobidBiblio]:
ref.index = i
refs.append(ref)
return refs
+
+
+def parse_citations_xml(xml_text: AnyStr) -> List[GrobidBiblio]:
+ """
+ Alias for `parse_citation_list_xml()`
+ """
+ return parse_citation_list_xml(xml_text=xml_text)
+
+
+def parse_citation_xml(xml_text: AnyStr) -> Optional[GrobidBiblio]:
+ """
+ Parses a single citation. If the result is empty, or only contains the
+ 'unstructured' field, returns None.
+ """
+ # internally, re-uses parse_citation_list_xml()
+ citation_list = parse_citation_list_xml(xml_text)
+ if not citation_list:
+ return None
+ citation = citation_list[0]
+ citation.index = None
+ if citation.is_empty():
+ return None
+ else:
+ return citation
diff --git a/grobid_tei_xml/types.py b/grobid_tei_xml/types.py
index 725871b..32b773b 100644
--- a/grobid_tei_xml/types.py
+++ b/grobid_tei_xml/types.py
@@ -102,6 +102,27 @@ class GrobidBiblio:
istex_id: Optional[str] = None
url: Optional[str] = None
+ def is_empty(self) -> bool:
+ """
+ If true, this record is empty (contains no actual metadata)
+ """
+ if self.authors or self.editors:
+ return False
+ return not bool(
+ self.date
+ or self.title
+ or self.journal
+ or self.publisher
+ or self.volume
+ or self.issue
+ or self.pages
+ or self.doi
+ or self.pmid
+ or self.pmcid
+ or self.arxiv_id
+ or self.url
+ )
+
def to_dict(self) -> dict:
return _simplify_dict(asdict(self))
diff --git a/tests/files/empty_citation.tei.xml b/tests/files/empty_citation.tei.xml
new file mode 100644
index 0000000..cb21f6e
--- /dev/null
+++ b/tests/files/empty_citation.tei.xml
@@ -0,0 +1,6 @@
+<biblStruct >
+ <monogr>
+ <title/>
+ <imprint/>
+ </monogr>
+</biblStruct>
diff --git a/tests/files/empty_citation_unstructured.tei.xml b/tests/files/empty_citation_unstructured.tei.xml
new file mode 100644
index 0000000..35aee19
--- /dev/null
+++ b/tests/files/empty_citation_unstructured.tei.xml
@@ -0,0 +1,7 @@
+<biblStruct >
+ <monogr>
+ <title/>
+ <imprint/>
+ </monogr>
+ <note type="raw_reference">blah</note>
+</biblStruct>
diff --git a/tests/test_parse.py b/tests/test_parse.py
index 25529c4..70dcc98 100644
--- a/tests/test_parse.py
+++ b/tests/test_parse.py
@@ -5,7 +5,14 @@ import xml.etree.ElementTree
import pytest
-from grobid_tei_xml import GrobidBiblio, GrobidDocument, parse_citations_xml, parse_document_xml
+from grobid_tei_xml import (
+ GrobidBiblio,
+ GrobidDocument,
+ parse_citation_list_xml,
+ parse_citation_xml,
+ parse_citations_xml,
+ parse_document_xml,
+)
from grobid_tei_xml.types import *
@@ -198,7 +205,8 @@ def test_single_citations_xml() -> None:
</monogr>
</biblStruct>"""
- d = parse_citations_xml(citation_xml)[0]
+ d = parse_citation_xml(citation_xml)
+ assert d
assert (
d.title
== """Mesh migration following abdominal hernia repair: a comprehensive review"""
@@ -217,13 +225,21 @@ def test_single_citations_xml() -> None:
assert d.issue == "2"
assert d.journal == "Hernia"
+ d2 = parse_citations_xml(citation_xml)[0]
+ assert d.title == d2.title
+ assert d.authors == d2.authors
+
def test_citation_list_xml() -> None:
with open("tests/files/example_citation_list.xml", "r") as f:
tei_xml = f.read()
- citations = parse_citations_xml(tei_xml)
+ citations = parse_citation_list_xml(tei_xml)
+
+ # verify that old function still works
+ assert citations == parse_citations_xml(tei_xml)
+
assert len(citations) == 13
assert citations[3].note == "The Research Handbook on International Environmental Law"
@@ -300,3 +316,25 @@ def test_grobid_070_document() -> None:
c.to_csl_dict()
c.to_dict()
c.to_legacy_dict()
+
+
+def test_empty_citations() -> None:
+
+ with open("tests/files/empty_citation_unstructured.tei.xml", "r") as f:
+ mostly_empty_xml = f.read()
+
+ with open("tests/files/empty_citation.tei.xml", "r") as f:
+ empty_xml = f.read()
+
+ assert parse_citation_xml(empty_xml) is None
+ assert parse_citation_xml(mostly_empty_xml) is None
+
+ d = parse_citation_list_xml(empty_xml)
+ assert d
+ assert d[0].index == 0
+ assert d[0].unstructured is None
+
+ d2 = parse_citation_list_xml(mostly_empty_xml)
+ assert d2
+ assert d2[0].index == 0
+ assert d2[0].unstructured == "blah"