about summary refs log tree commit diff stats
path: root/grobid_tei_xml/parse.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2021-10-21 18:22:12 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-10-21 18:22:12 -0700
commit: 2bf52b0622005ed8a7c51e59faa9873600d9cb5f (patch)
tree: 6de17ab8a3f77053c4f61770011af4b7de2c4a17 /grobid_tei_xml/parse.py
parent: 8c09c866d81854ab06b85bee6c39124c7b2faf44 (diff)
download: grobid_tei_xml-2bf52b0622005ed8a7c51e59faa9873600d9cb5f.tar.gz
download: grobid_tei_xml-2bf52b0622005ed8a7c51e59faa9873600d9cb5f.zip
more progress
Diffstat (limited to 'grobid_tei_xml/parse.py')
-rwxr-xr-x grobid_tei_xml/parse.py 34
1 files changed, 27 insertions, 7 deletions
diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py
index a239e4d..32c5d0f 100755
--- a/grobid_tei_xml/parse.py
+++ b/grobid_tei_xml/parse.py
@@ -1,4 +1,3 @@
-
import io
import json
import xml.etree.ElementTree as ET
@@ -20,9 +19,12 @@ def _string_to_tree(content: AnyStr) -> ET:
elif isinstance(content, ET):
return content
else:
- raise TypeError(f"expected XML as string or bytes, got: {type(content)}")
+ raise TypeError(
+ f"expected XML as string or bytes, got: {type(content)}")
+
-def _parse_authors(elem: Optional[ET.Element]) -> List[GrobidAffiliation]:
+def _parse_authors(elem: Optional[ET.Element],
+ ns: str = ns) -> List[GrobidAffiliation]:
if not elem:
return []
names = []
@@ -64,6 +66,7 @@ def _parse_authors(elem: Optional[ET.Element]) -> List[GrobidAffiliation]:
names.append(GrobidAuthor(**obj))
return names
+
def _parse_citation(elem: ET.Element, ns: str = ns) -> GrobidCitation:
ref: Dict[str, Any] = dict()
ref["id"] = elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id")
@@ -78,7 +81,7 @@ def _parse_citation(elem: ET.Element, ns: str = ns) -> GrobidCitation:
else:
ref["journal"] = None
ref["title"] = other_title
- ref["authors"] = _parse_authors(elem)
+ ref["authors"] = _parse_authors(elem, ns=ns)
ref["publisher"] = elem.findtext(
f".//{{{ns}}}publicationStmt/{{{ns}}}publisher")
if not ref["publisher"]:
@@ -117,6 +120,7 @@ def _parse_citation(elem: ET.Element, ns: str = ns) -> GrobidCitation:
ref["url"] = None
return GrobidCitation(**ref)
+
def _parse_journal(elem: ET.Element, ns: str = ns) -> GrobidJournal:
journal = dict()
journal["name"] = elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title")
@@ -131,6 +135,7 @@ def _parse_journal(elem: ET.Element, ns: str = ns) -> GrobidJournal:
journal["abbrev"] = None
return GrobidJournal(**journal)
+
def _parse_header(elem: ET.Element, ns: str = ns) -> GrobidHeader:
header = elem
info = dict()
@@ -145,6 +150,7 @@ def _parse_header(elem: ET.Element, ns: str = ns) -> GrobidHeader:
info["doi"] = info["doi"].lower()
return GrobidHeader(**info)
+
def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
"""
Use this function to parse TEI-XML of a full document or header processed
@@ -155,7 +161,6 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
tree = _string_to_tree(xml_text)
tei = tree.getroot()
info = dict()
- encumbered = True
header = tei.find(f".//{{{ns}}}teiHeader")
if header is None:
@@ -188,17 +193,32 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
el = tei.find(f".//{{{ns}}}text/{{{ns}}}body")
doc.body = (el or None) and " ".join(el.itertext()).strip()
el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="acknowledgement"]')
- doc.acknowledgement = (el or None) and " ".join(
- el.itertext()).strip()
+ doc.acknowledgement = (el or None) and " ".join(el.itertext()).strip()
el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="annex"]')
doc.annex = (el or None) and " ".join(el.itertext()).strip()
return doc
+
def parse_citations_xml(xml_text: AnyStr) -> List[GrobidCitation]:
"""
Use this function to parse TEI-XML of one or more references.
Eg, the output of '/api/processReferences' or '/api/processCitation'.
"""
+ # XXX: this replacement shouldn't be needed?
+ xml_text = xml_text.replace('xmlns="http://www.tei-c.org/ns/1.0"', "")
tree = _string_to_tree(xml_text)
+ root = tree.getroot()
+
+ if root.tag == 'biblStruct':
+ ref = _parse_citation(root, ns='')
+ ref.index = 0
+ return [ref]
+
+ refs = []
+ for (i, bs) in enumerate(tree.findall(f".//biblStruct")):
+ ref = _parse_citation(bs, ns='')
+ ref.index = i
+ refs.append(ref)
+ return refs