diff options
Diffstat (limited to 'grobid_tei_xml/parse.py')
-rwxr-xr-x | grobid_tei_xml/parse.py | 34 |
1 files changed, 27 insertions, 7 deletions
diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py index a239e4d..32c5d0f 100755 --- a/grobid_tei_xml/parse.py +++ b/grobid_tei_xml/parse.py @@ -1,4 +1,3 @@ - import io import json import xml.etree.ElementTree as ET @@ -20,9 +19,12 @@ def _string_to_tree(content: AnyStr) -> ET: elif isinstance(content, ET): return content else: - raise TypeError(f"expected XML as string or bytes, got: {type(content)}") + raise TypeError( + f"expected XML as string or bytes, got: {type(content)}") + -def _parse_authors(elem: Optional[ET.Element]) -> List[GrobidAffiliation]: +def _parse_authors(elem: Optional[ET.Element], + ns: str = ns) -> List[GrobidAffiliation]: if not elem: return [] names = [] @@ -64,6 +66,7 @@ def _parse_authors(elem: Optional[ET.Element]) -> List[GrobidAffiliation]: names.append(GrobidAuthor(**obj)) return names + def _parse_citation(elem: ET.Element, ns: str = ns) -> GrobidCitation: ref: Dict[str, Any] = dict() ref["id"] = elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id") @@ -78,7 +81,7 @@ def _parse_citation(elem: ET.Element, ns: str = ns) -> GrobidCitation: else: ref["journal"] = None ref["title"] = other_title - ref["authors"] = _parse_authors(elem) + ref["authors"] = _parse_authors(elem, ns=ns) ref["publisher"] = elem.findtext( f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") if not ref["publisher"]: @@ -117,6 +120,7 @@ def _parse_citation(elem: ET.Element, ns: str = ns) -> GrobidCitation: ref["url"] = None return GrobidCitation(**ref) + def _parse_journal(elem: ET.Element, ns: str = ns) -> GrobidJournal: journal = dict() journal["name"] = elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") @@ -131,6 +135,7 @@ def _parse_journal(elem: ET.Element, ns: str = ns) -> GrobidJournal: journal["abbrev"] = None return GrobidJournal(**journal) + def _parse_header(elem: ET.Element, ns: str = ns) -> GrobidHeader: header = elem info = dict() @@ -145,6 +150,7 @@ def _parse_header(elem: ET.Element, ns: str = ns) -> GrobidHeader: info["doi"] = info["doi"].lower() return GrobidHeader(**info) + def parse_document_xml(xml_text: AnyStr) -> GrobidDocument: """ Use this function to parse TEI-XML of a full document or header processed @@ -155,7 +161,6 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument: tree = _string_to_tree(xml_text) tei = tree.getroot() info = dict() - encumbered = True header = tei.find(f".//{{{ns}}}teiHeader") if header is None: @@ -188,17 +193,32 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument: el = tei.find(f".//{{{ns}}}text/{{{ns}}}body") doc.body = (el or None) and " ".join(el.itertext()).strip() el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="acknowledgement"]') - doc.acknowledgement = (el or None) and " ".join( - el.itertext()).strip() + doc.acknowledgement = (el or None) and " ".join(el.itertext()).strip() el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="annex"]') doc.annex = (el or None) and " ".join(el.itertext()).strip() return doc + def parse_citations_xml(xml_text: AnyStr) -> List[GrobidCitation]: """ Use this function to parse TEI-XML of one or more references. Eg, the output of '/api/processReferences' or '/api/processCitation'. """ + # XXX: this replacement shouldn't be needed? + xml_text = xml_text.replace('xmlns="http://www.tei-c.org/ns/1.0"', "") tree = _string_to_tree(xml_text) + root = tree.getroot() + + if root.tag == 'biblStruct': + ref = _parse_citation(root, ns='') + ref.index = 0 + return [ref] + + refs = [] + for (i, bs) in enumerate(tree.findall(f".//biblStruct")): + ref = _parse_citation(bs, ns='') + ref.index = i + refs.append(ref) + return refs |