more progress

author: Bryan Newbold <bnewbold@archive.org> 2021-10-21 18:22:12 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-10-21 18:22:12 -0700
commit: 2bf52b0622005ed8a7c51e59faa9873600d9cb5f (patch)
tree: 6de17ab8a3f77053c4f61770011af4b7de2c4a17 /grobid_tei_xml/parse.py
parent: 8c09c866d81854ab06b85bee6c39124c7b2faf44 (diff)
download: grobid_tei_xml-2bf52b0622005ed8a7c51e59faa9873600d9cb5f.tar.gz grobid_tei_xml-2bf52b0622005ed8a7c51e59faa9873600d9cb5f.zip
1 file changed, 27 insertions, 7 deletions
diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py
index a239e4d..32c5d0f 100755
--- a/grobid_tei_xml/parse.py
+++ b/grobid_tei_xml/parse.py
@@ -1,4 +1,3 @@
-
 import io
 import json
 import xml.etree.ElementTree as ET
@@ -20,9 +19,12 @@ def _string_to_tree(content: AnyStr) -> ET:
     elif isinstance(content, ET):
         return content
     else:
-        raise TypeError(f"expected XML as string or bytes, got: {type(content)}")
+        raise TypeError(
+            f"expected XML as string or bytes, got: {type(content)}")
+
 
-def _parse_authors(elem: Optional[ET.Element]) -> List[GrobidAffiliation]:
+def _parse_authors(elem: Optional[ET.Element],
+                   ns: str = ns) -> List[GrobidAffiliation]:
     if not elem:
         return []
     names = []
@@ -64,6 +66,7 @@ def _parse_authors(elem: Optional[ET.Element]) -> List[GrobidAffiliation]:
         names.append(GrobidAuthor(**obj))
     return names
 
+
 def _parse_citation(elem: ET.Element, ns: str = ns) -> GrobidCitation:
     ref: Dict[str, Any] = dict()
     ref["id"] = elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id")
@@ -78,7 +81,7 @@ def _parse_citation(elem: ET.Element, ns: str = ns) -> GrobidCitation:
         else:
             ref["journal"] = None
             ref["title"] = other_title
-    ref["authors"] = _parse_authors(elem)
+    ref["authors"] = _parse_authors(elem, ns=ns)
     ref["publisher"] = elem.findtext(
         f".//{{{ns}}}publicationStmt/{{{ns}}}publisher")
     if not ref["publisher"]:
@@ -117,6 +120,7 @@ def _parse_citation(elem: ET.Element, ns: str = ns) -> GrobidCitation:
         ref["url"] = None
     return GrobidCitation(**ref)
 
+
 def _parse_journal(elem: ET.Element, ns: str = ns) -> GrobidJournal:
     journal = dict()
     journal["name"] = elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title")
@@ -131,6 +135,7 @@ def _parse_journal(elem: ET.Element, ns: str = ns) -> GrobidJournal:
     journal["abbrev"] = None
     return GrobidJournal(**journal)
 
+
 def _parse_header(elem: ET.Element, ns: str = ns) -> GrobidHeader:
     header = elem
     info = dict()
@@ -145,6 +150,7 @@ def _parse_header(elem: ET.Element, ns: str = ns) -> GrobidHeader:
         info["doi"] = info["doi"].lower()
     return GrobidHeader(**info)
 
+
 def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
     """
     Use this function to parse TEI-XML of a full document or header processed
@@ -155,7 +161,6 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
     tree = _string_to_tree(xml_text)
     tei = tree.getroot()
     info = dict()
-    encumbered = True
 
     header = tei.find(f".//{{{ns}}}teiHeader")
     if header is None:
@@ -188,17 +193,32 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
     el = tei.find(f".//{{{ns}}}text/{{{ns}}}body")
     doc.body = (el or None) and " ".join(el.itertext()).strip()
     el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="acknowledgement"]')
-    doc.acknowledgement = (el or None) and " ".join(
-        el.itertext()).strip()
+    doc.acknowledgement = (el or None) and " ".join(el.itertext()).strip()
     el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="annex"]')
     doc.annex = (el or None) and " ".join(el.itertext()).strip()
 
     return doc
 
+
 def parse_citations_xml(xml_text: AnyStr) -> List[GrobidCitation]:
     """
     Use this function to parse TEI-XML of one or more references.
     
     Eg, the output of '/api/processReferences' or '/api/processCitation'.
     """
+    # XXX: this replacement shouldn't be needed?
+    xml_text = xml_text.replace('xmlns="http://www.tei-c.org/ns/1.0"', "")
     tree = _string_to_tree(xml_text)
+    root = tree.getroot()
+
+    if root.tag == 'biblStruct':
+        ref = _parse_citation(root, ns='')
+        ref.index = 0
+        return [ref]
+
+    refs = []
+    for (i, bs) in enumerate(tree.findall(f".//biblStruct")):
+        ref = _parse_citation(bs, ns='')
+        ref.index = i
+        refs.append(ref)
+    return refs
author	Bryan Newbold <bnewbold@archive.org>	2021-10-21 18:22:12 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2021-10-21 18:22:12 -0700
commit	2bf52b0622005ed8a7c51e59faa9873600d9cb5f (patch)
tree	6de17ab8a3f77053c4f61770011af4b7de2c4a17 /grobid_tei_xml/parse.py
parent	8c09c866d81854ab06b85bee6c39124c7b2faf44 (diff)
download	grobid_tei_xml-2bf52b0622005ed8a7c51e59faa9873600d9cb5f.tar.gz grobid_tei_xml-2bf52b0622005ed8a7c51e59faa9873600d9cb5f.zip