From 2bf52b0622005ed8a7c51e59faa9873600d9cb5f Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Thu, 21 Oct 2021 18:22:12 -0700
Subject: more progress

---
 grobid_tei_xml/__init__.py            |   4 +-
 grobid_tei_xml/__main__.py            |  18 ++-
 grobid_tei_xml/grobid2json.py         |   1 +
 grobid_tei_xml/grobid_unstructured.py |   1 -
 grobid_tei_xml/parse.py               |  34 ++++-
 grobid_tei_xml/types.py               |  22 ++-
 tests/files/example_citation_list.xml | 278 ++++++++++++++++++++++++++++++++++
 tests/test_grobid2json.py             | 143 ++++++++---------
 tests/test_grobid_unstructured.py     |  68 ---------
 tests/test_parse.py                   | 213 ++++++++++++++++++++++++++
 10 files changed, 619 insertions(+), 163 deletions(-)
 create mode 100644 tests/files/example_citation_list.xml
 delete mode 100644 tests/test_grobid_unstructured.py
 create mode 100644 tests/test_parse.py

diff --git a/grobid_tei_xml/__init__.py b/grobid_tei_xml/__init__.py
index bf8a133..d7d4ada 100644
--- a/grobid_tei_xml/__init__.py
+++ b/grobid_tei_xml/__init__.py
@@ -1,5 +1,5 @@
 __version__ = "0.1.0"
 
-from .types import GrobidDocument, GrobidCitation
-from .parse import parse_document_xml, parse_citations_xml
 from .grobid2json import teixml2json
+from .parse import parse_citations_xml, parse_document_xml
+from .types import GrobidCitation, GrobidDocument
diff --git a/grobid_tei_xml/__main__.py b/grobid_tei_xml/__main__.py
index 489bd4e..2d10e84 100644
--- a/grobid_tei_xml/__main__.py
+++ b/grobid_tei_xml/__main__.py
@@ -1,5 +1,8 @@
+import argparse
+import json
+
+from . import parse_document_xml
 
-from .parse import parse_article
 
 def main() -> None:  # pragma no cover
     parser = argparse.ArgumentParser(
@@ -19,11 +22,14 @@ def main() -> None:  # pragma no cover
 
     for filename in args.teifiles:
         content = open(filename, "r").read()
-        print(
-            json.dumps(
-                parse_article(content, encumbered=(not args.no_encumbered)),
-                sort_keys=True,
-            ))
+        doc = parse_document_xml(content)
+        if args.no_encumbered:
+            doc.body = None
+            doc.annex = None
+            doc.acknowledgement = None  # singular, as assigned in parse.py
+            doc.abstract = None
+        print(json.dumps(doc.to_dict(), sort_keys=True))
+
 
 if __name__ == "__main__":  # pragma no cover
     main()
diff --git a/grobid_tei_xml/grobid2json.py b/grobid_tei_xml/grobid2json.py
index c005b31..3c56b19 100644
--- a/grobid_tei_xml/grobid2json.py
+++ b/grobid_tei_xml/grobid2json.py
@@ -215,6 +215,7 @@ def teixml2json(content: AnyStr, encumbered: bool = True) -> Dict[str, Any]:
             info.pop(k)
     return info
 
+
 def transform_grobid_ref_xml(raw_xml: str) -> Optional[dict]:
     """
     Parses GROBID XML for the case of a single reference/citation string (eg,
diff --git a/grobid_tei_xml/grobid_unstructured.py b/grobid_tei_xml/grobid_unstructured.py
index bdead05..cbf7322 100644
--- a/grobid_tei_xml/grobid_unstructured.py
+++ b/grobid_tei_xml/grobid_unstructured.py
@@ -14,4 +14,3 @@ import xml.etree.ElementTree as ET
 from typing import Optional
 
 from .parse import biblio_info
-
diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py
index a239e4d..32c5d0f 100755
--- a/grobid_tei_xml/parse.py
+++ b/grobid_tei_xml/parse.py
@@ -1,4 +1,3 @@
-
 import io
 import json
 import xml.etree.ElementTree as ET
@@ -20,9 +19,12 @@ def _string_to_tree(content: AnyStr) -> ET:
     elif isinstance(content, ET):
         return content
     else:
-        raise TypeError(f"expected XML as string or bytes, got: {type(content)}")
+        raise TypeError(
+            f"expected XML as string or bytes, got: {type(content)}")
+
 
-def _parse_authors(elem: Optional[ET.Element]) -> List[GrobidAffiliation]:
+def _parse_authors(elem: Optional[ET.Element],
+                   ns: str = ns) -> List[GrobidAffiliation]:
     if not elem:
         return []
     names = []
@@ -64,6 +66,7 @@ def _parse_authors(elem: Optional[ET.Element]) -> List[GrobidAffiliation]:
         names.append(GrobidAuthor(**obj))
     return names
 
+
 def _parse_citation(elem: ET.Element, ns: str = ns) -> GrobidCitation:
     ref: Dict[str, Any] = dict()
     ref["id"] = elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id")
@@ -78,7 +81,7 @@ def _parse_citation(elem: ET.Element, ns: str = ns) -> GrobidCitation:
         else:
             ref["journal"] = None
             ref["title"] = other_title
-    ref["authors"] = _parse_authors(elem)
+    ref["authors"] = _parse_authors(elem, ns=ns)
     ref["publisher"] = elem.findtext(
         f".//{{{ns}}}publicationStmt/{{{ns}}}publisher")
     if not ref["publisher"]:
@@ -117,6 +120,7 @@ def _parse_citation(elem: ET.Element, ns: str = ns) -> GrobidCitation:
         ref["url"] = None
     return GrobidCitation(**ref)
 
+
 def _parse_journal(elem: ET.Element, ns: str = ns) -> GrobidJournal:
     journal = dict()
     journal["name"] = elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title")
@@ -131,6 +135,7 @@ def _parse_journal(elem: ET.Element, ns: str = ns) -> GrobidJournal:
     journal["abbrev"] = None
     return GrobidJournal(**journal)
 
+
 def _parse_header(elem: ET.Element, ns: str = ns) -> GrobidHeader:
     header = elem
     info = dict()
@@ -145,6 +150,7 @@ def _parse_header(elem: ET.Element, ns: str = ns) -> GrobidHeader:
         info["doi"] = info["doi"].lower()
     return GrobidHeader(**info)
 
+
 def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
     """
     Use this function to parse TEI-XML of a full document or header processed
@@ -155,7 +161,6 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
     tree = _string_to_tree(xml_text)
     tei = tree.getroot()
     info = dict()
-    encumbered = True
 
     header = tei.find(f".//{{{ns}}}teiHeader")
     if header is None:
@@ -188,17 +193,32 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
     el = tei.find(f".//{{{ns}}}text/{{{ns}}}body")
     doc.body = (el or None) and " ".join(el.itertext()).strip()
     el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="acknowledgement"]')
-    doc.acknowledgement = (el or None) and " ".join(
-        el.itertext()).strip()
+    doc.acknowledgement = (el or None) and " ".join(el.itertext()).strip()
     el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="annex"]')
     doc.annex = (el or None) and " ".join(el.itertext()).strip()
 
     return doc
 
+
 def parse_citations_xml(xml_text: AnyStr) -> List[GrobidCitation]:
     """
     Use this function to parse TEI-XML of one or more references.
     
     Eg, the output of '/api/processReferences' or '/api/processCitation'.
     """
+    tei_xmlns = 'xmlns="http://www.tei-c.org/ns/1.0"'  # XXX: stripping this shouldn't be needed?
+    xml_text = xml_text.replace(tei_xmlns.encode(), b"") if isinstance(xml_text, bytes) else xml_text.replace(tei_xmlns, "")
     tree = _string_to_tree(xml_text)
+    root = tree.getroot()
+    # a bare <biblStruct> root is not matched by the ".//biblStruct" path below
+    if root.tag == 'biblStruct':
+        ref = _parse_citation(root, ns='')
+        ref.index = 0
+        return [ref]
+    # otherwise number every <biblStruct> found beneath the root, in order
+    refs = []
+    for (i, bs) in enumerate(tree.findall(".//biblStruct")):
+        ref = _parse_citation(bs, ns='')
+        ref.index = i
+        refs.append(ref)
+    return refs
diff --git a/grobid_tei_xml/types.py b/grobid_tei_xml/types.py
index 795d37f..aabe424 100644
--- a/grobid_tei_xml/types.py
+++ b/grobid_tei_xml/types.py
@@ -1,6 +1,5 @@
-
+from dataclasses import asdict, dataclass
 from typing import Any, AnyStr, Dict, List, Optional
-from dataclasses import dataclass
 
 
 @dataclass
@@ -11,6 +10,7 @@ class GrobidAddress:
     country: Optional[str] = None
     country_code: Optional[str] = None
 
+
 @dataclass
 class GrobidAffiliation:
     address: Optional[GrobidAddress] = None
@@ -18,6 +18,7 @@ class GrobidAffiliation:
     department: Optional[str] = None
     laboratory: Optional[str] = None
 
+
 @dataclass
 class GrobidAuthor:
     name: Optional[str]
@@ -26,6 +27,7 @@ class GrobidAuthor:
     surname: Optional[str] = None
     affiliation: Optional[dict] = None
 
+
 @dataclass
 class GrobidCitation:
     authors: List[GrobidAuthor]
@@ -52,6 +54,7 @@ class GrobidCitation:
     def to_dict(self) -> dict:
         return _simplify_dict(asdict(self))
 
+
 @dataclass
 class GrobidJournal:
     name: Optional[str] = None
@@ -62,6 +65,7 @@ class GrobidJournal:
     issn: Optional[str] = None
     eissn: Optional[str] = None
 
+
 @dataclass
 class GrobidHeader:
     title: Optional[str] = None
@@ -71,6 +75,7 @@ class GrobidHeader:
     #TODO: note: Optional[str]
     journal: Optional[GrobidJournal] = None
 
+
 @dataclass
 class GrobidDocument:
     grobid_version: str
@@ -87,10 +92,21 @@ class GrobidDocument:
     def to_dict(self) -> dict:
         return _simplify_dict(asdict(self))
 
+
 def _simplify_dict(d: dict) -> dict:
+    """
+    Recursively drop keys whose value is None, {} or '' from `d`, its sub-dicts
+    and dicts inside lists. Empty lists are kept; None/{}/'' input gives None.
+    """
+    if d in [None, {}, '']:
+        return None
     for k in list(d.keys()):
         if isinstance(d[k], dict):
             d[k] = _simplify_dict(d[k])
-        if d[k] in [None, [], {}, '']:
+        elif isinstance(d[k], list):
+            for i, item in enumerate(d[k]):
+                if isinstance(item, dict):
+                    d[k][i] = _simplify_dict(item)
+        if d[k] in [None, {}, '']:
             d.pop(k)
     return d
diff --git a/tests/files/example_citation_list.xml b/tests/files/example_citation_list.xml
new file mode 100644
index 0000000..d640393
--- /dev/null
+++ b/tests/files/example_citation_list.xml
@@ -0,0 +1,278 @@
+<TEI xmlns="http://www.tei-c.org/ns/1.0" xmlns:xlink="http://www.w3.org/1999/xlink" 
+ xmlns:mml="http://www.w3.org/1998/Math/MathML">
+	<teiHeader/>
+	<text>
+		<front/>
+		<body/>
+		<back>
+			<div>
+				<listBibl>
+<biblStruct xml:id="b0">
+	<analytic>
+		<title level="a" type="main">E-commerce: the challenge for UK SMEs in the twenty-first century</title>
+		<author>
+			<persName><forename type="first">M</forename><surname>Quayle</surname></persName>
+		</author>
+	</analytic>
+	<monogr>
+		<title level="j">International Journal of Operations and Production Management</title>
+		<imprint>
+			<biblScope unit="volume">22</biblScope>
+			<biblScope unit="issue">10</biblScope>
+			<biblScope unit="page" from="1148" to="1161" />
+			<date type="published" when="2002" />
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b1">
+	<analytic>
+		<title level="a" type="main">Evolution, challenges and path forward for low temperature combustion engines</title>
+		<author>
+			<persName><forename type="first">A</forename><forename type="middle">K</forename><surname>Agarwal</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">A</forename><forename type="middle">P</forename><surname>Singh</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">R</forename><forename type="middle">K</forename><surname>Maurya</surname></persName>
+		</author>
+	</analytic>
+	<monogr>
+		<title level="j">Progress in Energy and Combustion Science</title>
+		<imprint>
+			<biblScope unit="volume">61</biblScope>
+			<biblScope unit="page" from="1" to="56" />
+			<date type="published" when="2017" />
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b2">
+	<analytic>
+		<title level="a" type="main">Thrombotic complications of central venous cath- eters in cancer patients</title>
+		<author>
+			<persName><forename type="first">D</forename><forename type="middle">J</forename><surname>Kutter</surname></persName>
+		</author>
+	</analytic>
+	<monogr>
+		<title level="j">Oncologist</title>
+		<imprint>
+			<biblScope unit="volume">9</biblScope>
+			<biblScope unit="page" from="207" to="216" />
+			<date type="published" when="2004" />
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b3">
+	<monogr>
+		<author>
+			<persName><forename type="first">K</forename><surname>Uhlířová</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">M</forename><surname>Drumbl</surname></persName>
+		</author>
+		<title level="m">Actors and Law Making in International Environmental Law</title>
+				<editor>
+			<persName><forename type="first">M</forename><surname>Fitzmaurice</surname></persName>
+			<persName><forename type="first">M</forename><surname>Brus</surname></persName>
+			<persName><forename type="first">P</forename><surname>Merkouris</surname></persName>
+		</editor>
+		<meeting><address><addrLine>Cheltenham</addrLine></address></meeting>
+		<imprint>
+			<publisher>Edward Elgar Publishing</publisher>
+			<date type="published" when="2020" />
+			<biblScope unit="volume">50</biblScope>
+		</imprint>
+	</monogr>
+	<note>The Research Handbook on International Environmental Law</note>
+</biblStruct>
+
+<biblStruct xml:id="b4">
+	<analytic>
+		<title level="a" type="main">Self as- sembly protein systems: Microbial S-layers</title>
+		<author>
+			<persName><forename type="first">U</forename><forename type="middle">B</forename><surname>Sleytr</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">M</forename><surname>Sára</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">D</forename><surname>Pum</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">B</forename><surname>Schuster</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">P</forename><surname>Messner</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">C</forename><surname>Schäffer</surname></persName>
+		</author>
+	</analytic>
+	<monogr>
+		<title level="j">Biopolymers</title>
+		<editor>A. Steinbüchel and S. Fahnestock</editor>
+		<imprint>
+			<biblScope unit="volume">7</biblScope>
+			<biblScope unit="page" from="285" to="338" />
+			<date type="published" when="2003" />
+			<publisher>Wiley-VCH</publisher>
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b5">
+	<analytic>
+		<title level="a" type="main">Self-illuminating quantum dot conjugates for in vivo imaging</title>
+		<author>
+			<persName><forename type="first">M-K</forename><surname>So</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">C</forename><surname>Xu</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">A</forename><forename type="middle">M</forename><surname>Loening</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">S</forename><forename type="middle">S</forename><surname>Gambhir</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">J</forename><surname>Rao</surname></persName>
+		</author>
+	</analytic>
+	<monogr>
+		<title level="j">Nat Biotech</title>
+		<imprint>
+			<biblScope unit="volume">24</biblScope>
+			<biblScope unit="page" from="339" to="343" />
+			<date type="published" when="2006" />
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b6">
+	<analytic>
+		<title level="a" type="main">Informed conditioning on clinical covariates increases power in case-control association studies</title>
+		<author>
+			<persName><forename type="first">N</forename><surname>Zaitlen</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">S</forename><surname>Lindström</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">B</forename><surname>Pasaniuc</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">M</forename><surname>Cornelis</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">G</forename><surname>Genovese</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">S</forename><surname>Pollack</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">B</forename><forename type="middle">I</forename><surname>Freedman</surname></persName>
+		</author>
+	</analytic>
+	<monogr>
+		<title level="j">PLoS genetics</title>
+		<imprint>
+			<biblScope unit="volume">8</biblScope>
+			<biblScope unit="issue">11</biblScope>
+			<biblScope unit="page">e1003032</biblScope>
+			<date type="published" when="2012" />
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b7">
+	<monogr>
+		<author>
+			<persName><forename type="first">K</forename><surname>Von Grebmer</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">A</forename><surname>Saltzman</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">E</forename><surname>Birol</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">D</forename><surname>Wiesmann</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">N</forename><surname>Prasai</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">S</forename><surname>Yin</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">Y</forename><surname>Yohannes</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">P</forename><surname>Menon</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">J</forename><surname>Thompson</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">A</forename><surname>Sonntag</surname></persName>
+		</author>
+		<title level="m">Global Hunger Index: The Challenge of Hidden Hunger</title>
+				<imprint>
+			<date type="published" when="2014" />
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b8">
+	<analytic>
+		<title level="a" type="main">Isolation and characterization of serum-resistant strains ofPseudomonas aeruginosa derived from serum-sensitive parental strains</title>
+		<author>
+			<persName><forename type="first">N</forename><forename type="middle">L</forename><surname>Schiller</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">D</forename><forename type="middle">R</forename><surname>Hackley</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">A</forename><surname>Morrison</surname></persName>
+		</author>
+	</analytic>
+	<monogr>
+		<title level="j">Curr Microbiol</title>
+		<imprint>
+			<biblScope unit="volume">10</biblScope>
+			<biblScope unit="page" from="185" to="190" />
+			<date type="published" when="1984" />
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b9">
+	<analytic>
+		<title level="a" type="main">Importin 7 and importin alpha/importin beta are nuclear import receptors for the glucocorticoid receptor</title>
+		<author>
+			<persName><forename type="first">N</forename><forename type="middle">D</forename><surname>Freedman</surname></persName>
+		</author>
+		<author>
+			<persName><forename type="first">K</forename><forename type="middle">R</forename><surname>Yamamoto</surname></persName>
+		</author>
+	</analytic>
+	<monogr>
+		<title level="j">Mol Biol Cell</title>
+		<imprint>
+			<biblScope unit="volume">15</biblScope>
+			<biblScope unit="page" from="2276" to="2286" />
+			<date type="published" when="2004" />
+		</imprint>
+	</monogr>
+</biblStruct>
+
+				</listBibl>
+			</div>
+		</back>
+	</text>
+</TEI>
+
diff --git a/tests/test_grobid2json.py b/tests/test_grobid2json.py
index ed5d996..a1c975e 100644
--- a/tests/test_grobid2json.py
+++ b/tests/test_grobid2json.py
@@ -3,10 +3,10 @@ import json
 import pytest
 
 from grobid_tei_xml import teixml2json, parse_document_xml, GrobidDocument, GrobidCitation
-from grobid_tei_xml.types import *
+from grobid_tei_xml.grobid2json import transform_grobid_ref_xml
 
 
-def test_teixml2json_small_xml():
+def test_small_xml():
 
     with open('tests/files/small.xml', 'r') as f:
         tei_xml = f.read()
@@ -15,80 +15,6 @@ def test_teixml2json_small_xml():
 
     assert teixml2json(tei_xml) == json_form
 
-    assert parse_document_xml(tei_xml).to_dict() == json_form
-
-def test_teixml2json_small_xml():
-
-    with open('tests/files/small.xml', 'r') as f:
-        tei_xml = f.read()
-
-    doc = parse_document_xml(tei_xml)
-    expected = GrobidDocument(
-        grobid_version='0.5.1-SNAPSHOT',
-        grobid_timestamp='2018-04-02T00:31+0000',
-        language_code='en',
-        header=GrobidHeader(
-            title="Dummy Example File",
-            authors=[
-                GrobidAuthor(
-                    name="Brewster Kahle",
-                    given_name="Brewster",
-                    surname="Kahle",
-                    affiliation=GrobidAffiliation(
-                        department="Faculty ofAgricultrial Engineering",
-                        laboratory="Plant Physiology Laboratory",
-                        institution="Technion-Israel Institute of Technology",
-                        address=GrobidAddress(
-                            post_code="32000",
-                            settlement="Haifa",
-                            country="Israel",
-                        ),
-                    )
-                ),
-                GrobidAuthor(
-                    name="J Doe",
-                    given_name="J",
-                    surname="Doe",
-                ),
-            ],
-            journal=GrobidJournal(
-                name="Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
-            ),
-            date="2000",
-        ),
-        abstract="Everything you ever wanted to know about nothing",
-        body="Introduction \nEverything starts somewhere, as somebody [1]  once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.",
-        citations=[
-            GrobidCitation(
-                index=0,
-                id="b0",
-                authors=[
-                    GrobidAuthor(
-                        name="A Seaperson",
-                        given_name="A",
-                        surname="Seaperson"
-                    )
-                ],
-                date="2001",
-                journal="Letters in the Alphabet",
-                title="Everything is Wonderful",
-                volume="20",
-                pages="1-11",
-            ),
-            GrobidCitation(
-                index=1,
-                id="b1",
-                authors=[],
-                date="2011-03-28",
-                journal="The Dictionary",
-                title="All about Facts",
-                volume="14",
-            ),
-        ],
-    )
-
-    assert doc == expected
-
 
 def test_invalid_xml():
 
@@ -125,3 +51,68 @@ def test_grobid_teixml2json() -> None:
         ref["unstructured"] ==
         "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19."
     )
+
+
+def test_transform_grobid_ref_xml():
+    citation_xml = """
+<biblStruct >
+    <analytic>
+        <title level="a" type="main">Mesh migration following abdominal hernia repair: a comprehensive review</title>
+        <author>
+            <persName
+                xmlns="http://www.tei-c.org/ns/1.0">
+                <forename type="first">H</forename>
+                <forename type="middle">B</forename>
+                <surname>Cunningham</surname>
+            </persName>
+        </author>
+        <author>
+            <persName
+                xmlns="http://www.tei-c.org/ns/1.0">
+                <forename type="first">J</forename>
+                <forename type="middle">J</forename>
+                <surname>Weis</surname>
+            </persName>
+        </author>
+        <author>
+            <persName
+                xmlns="http://www.tei-c.org/ns/1.0">
+                <forename type="first">L</forename>
+                <forename type="middle">R</forename>
+                <surname>Taveras</surname>
+            </persName>
+        </author>
+        <author>
+            <persName
+                xmlns="http://www.tei-c.org/ns/1.0">
+                <forename type="first">S</forename>
+                <surname>Huerta</surname>
+            </persName>
+        </author>
+        <idno type="DOI">10.1007/s10029-019-01898-9</idno>
+        <idno type="PMID">30701369</idno>
+    </analytic>
+    <monogr>
+        <title level="j">Hernia</title>
+        <imprint>
+            <biblScope unit="volume">23</biblScope>
+            <biblScope unit="issue">2</biblScope>
+            <biblScope unit="page" from="235" to="243" />
+            <date type="published" when="2019-01-30" />
+        </imprint>
+    </monogr>
+</biblStruct>"""
+
+    d = transform_grobid_ref_xml(citation_xml)
+    assert d[
+        'title'] == "Mesh migration following abdominal hernia repair: a comprehensive review"
+    assert d['authors'][2]['given_name'] == "L"
+    assert d['authors'][2]['surname'] == "Taveras"
+    assert d['authors'][2]['name'] == "L R Taveras"
+    assert d['doi'] == "10.1007/s10029-019-01898-9"
+    assert d['pmid'] == "30701369"
+    assert d['date'] == "2019-01-30"
+    assert d['pages'] == "235-243"
+    assert d['volume'] == "23"
+    assert d['issue'] == "2"
+    assert d['journal'] == "Hernia"
diff --git a/tests/test_grobid_unstructured.py b/tests/test_grobid_unstructured.py
deleted file mode 100644
index 91b7398..0000000
--- a/tests/test_grobid_unstructured.py
+++ /dev/null
@@ -1,68 +0,0 @@
-import pytest
-
-from grobid_tei_xml.grobid2json import transform_grobid_ref_xml
-
-
-def test_transform_grobid_ref_xml():
-    citation_xml = """
-<biblStruct >
-    <analytic>
-        <title level="a" type="main">Mesh migration following abdominal hernia repair: a comprehensive review</title>
-        <author>
-            <persName
-                xmlns="http://www.tei-c.org/ns/1.0">
-                <forename type="first">H</forename>
-                <forename type="middle">B</forename>
-                <surname>Cunningham</surname>
-            </persName>
-        </author>
-        <author>
-            <persName
-                xmlns="http://www.tei-c.org/ns/1.0">
-                <forename type="first">J</forename>
-                <forename type="middle">J</forename>
-                <surname>Weis</surname>
-            </persName>
-        </author>
-        <author>
-            <persName
-                xmlns="http://www.tei-c.org/ns/1.0">
-                <forename type="first">L</forename>
-                <forename type="middle">R</forename>
-                <surname>Taveras</surname>
-            </persName>
-        </author>
-        <author>
-            <persName
-                xmlns="http://www.tei-c.org/ns/1.0">
-                <forename type="first">S</forename>
-                <surname>Huerta</surname>
-            </persName>
-        </author>
-        <idno type="DOI">10.1007/s10029-019-01898-9</idno>
-        <idno type="PMID">30701369</idno>
-    </analytic>
-    <monogr>
-        <title level="j">Hernia</title>
-        <imprint>
-            <biblScope unit="volume">23</biblScope>
-            <biblScope unit="issue">2</biblScope>
-            <biblScope unit="page" from="235" to="243" />
-            <date type="published" when="2019-01-30" />
-        </imprint>
-    </monogr>
-</biblStruct>"""
-
-    d = transform_grobid_ref_xml(citation_xml)
-    assert d[
-        'title'] == "Mesh migration following abdominal hernia repair: a comprehensive review"
-    assert d['authors'][2]['given_name'] == "L"
-    assert d['authors'][2]['surname'] == "Taveras"
-    assert d['authors'][2]['name'] == "L R Taveras"
-    assert d['doi'] == "10.1007/s10029-019-01898-9"
-    assert d['pmid'] == "30701369"
-    assert d['date'] == "2019-01-30"
-    assert d['pages'] == "235-243"
-    assert d['volume'] == "23"
-    assert d['issue'] == "2"
-    assert d['journal'] == "Hernia"
diff --git a/tests/test_parse.py b/tests/test_parse.py
new file mode 100644
index 0000000..e79d41d
--- /dev/null
+++ b/tests/test_parse.py
@@ -0,0 +1,213 @@
+import xml
+import json
+import pytest
+
+from grobid_tei_xml import teixml2json, parse_document_xml, parse_citations_xml, GrobidDocument, GrobidCitation
+from grobid_tei_xml.types import *
+
+
+def test_small_xml():
+
+    with open('tests/files/small.xml', 'r') as f:
+        tei_xml = f.read()
+
+    doc = parse_document_xml(tei_xml)
+    expected = GrobidDocument(
+        grobid_version='0.5.1-SNAPSHOT',
+        grobid_timestamp='2018-04-02T00:31+0000',
+        language_code='en',
+        header=GrobidHeader(
+            title="Dummy Example File",
+            authors=[
+                GrobidAuthor(
+                    name="Brewster Kahle",
+                    given_name="Brewster",
+                    surname="Kahle",
+                    affiliation=GrobidAffiliation(
+                        department="Faculty ofAgricultrial Engineering",
+                        laboratory="Plant Physiology Laboratory",
+                        institution="Technion-Israel Institute of Technology",
+                        address=GrobidAddress(
+                            post_code="32000",
+                            settlement="Haifa",
+                            country="Israel",
+                        ),
+                    )),
+                GrobidAuthor(
+                    name="J Doe",
+                    given_name="J",
+                    surname="Doe",
+                ),
+            ],
+            journal=GrobidJournal(
+                name=
+                "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
+            ),
+            date="2000",
+        ),
+        abstract="Everything you ever wanted to know about nothing",
+        body=
+        "Introduction \nEverything starts somewhere, as somebody [1]  once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.",
+        citations=[
+            GrobidCitation(
+                index=0,
+                id="b0",
+                authors=[
+                    GrobidAuthor(name="A Seaperson",
+                                 given_name="A",
+                                 surname="Seaperson")
+                ],
+                date="2001",
+                journal="Letters in the Alphabet",
+                title="Everything is Wonderful",
+                volume="20",
+                pages="1-11",
+            ),
+            GrobidCitation(
+                index=1,
+                id="b1",
+                authors=[],
+                date="2011-03-28",
+                journal="The Dictionary",
+                title="All about Facts",
+                volume="14",
+            ),
+        ],
+    )
+
+    assert doc == expected
+
+
+def test_small_xml_json():
+    """to_dict() output should equal the legacy small.json after key munging."""
+    with open('tests/files/small.xml', 'r') as f:
+        tei_xml = f.read()
+    with open('tests/files/small.json', 'r') as f:
+        json_form = json.load(f)
+
+    d = parse_document_xml(tei_xml).to_dict()
+
+    # munge back to the old JSON format
+    d.update(d.pop('header'))
+    addr = d['authors'][0]['affiliation']['address']
+    addr['postCode'] = addr.pop('post_code')
+
+    # remove nulls from old JSON
+    for c in json_form['citations']:
+        for k in list(c.keys()):
+            if c[k] is None:
+                c.pop(k)
+
+    assert d == json_form
+
+
+def test_invalid_xml():
+
+    with pytest.raises(xml.etree.ElementTree.ParseError):
+        parse_document_xml("this is not XML")
+    with pytest.raises(xml.etree.ElementTree.ParseError):
+        parse_citations_xml("this is not XML")
+    with pytest.raises(ValueError):
+        parse_document_xml("<xml></xml>")
+
+
+def test_example_grobid_tei_xml() -> None:
+
+    with open("tests/files/example_grobid.tei.xml", "r") as f:
+        blob = f.read()
+
+    doc = parse_document_xml(blob)
+
+    assert (
+        doc.header.title ==
+        "Changes of patients' satisfaction with the health care services in Lithuanian Health Promoting Hospitals network"
+    )
+
+    ref = [c for c in doc.citations if c.id == "b12"][0]
+    assert ref.authors[0].name == "K Tasa"
+    assert ref.authors[0].given_name == "K"
+    assert ref.authors[0].surname == "Tasa"
+    assert ref.journal == "Quality Management in Health Care"
+    assert ref.title == "Using patient feedback for quality improvement"
+    assert ref.date == "1996"
+    assert ref.pages == "206-225"
+    assert ref.volume == "8"
+    assert (
+        ref.unstructured ==
+        "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19."
+    )
+
+
+def test_single_citations_xml():
+    citation_xml = """
+<biblStruct >
+    <analytic>
+        <title level="a" type="main">Mesh migration following abdominal hernia repair: a comprehensive review</title>
+        <author>
+            <persName
+                xmlns="http://www.tei-c.org/ns/1.0">
+                <forename type="first">H</forename>
+                <forename type="middle">B</forename>
+                <surname>Cunningham</surname>
+            </persName>
+        </author>
+        <author>
+            <persName
+                xmlns="http://www.tei-c.org/ns/1.0">
+                <forename type="first">J</forename>
+                <forename type="middle">J</forename>
+                <surname>Weis</surname>
+            </persName>
+        </author>
+        <author>
+            <persName
+                xmlns="http://www.tei-c.org/ns/1.0">
+                <forename type="first">L</forename>
+                <forename type="middle">R</forename>
+                <surname>Taveras</surname>
+            </persName>
+        </author>
+        <author>
+            <persName
+                xmlns="http://www.tei-c.org/ns/1.0">
+                <forename type="first">S</forename>
+                <surname>Huerta</surname>
+            </persName>
+        </author>
+        <idno type="DOI">10.1007/s10029-019-01898-9</idno>
+        <idno type="PMID">30701369</idno>
+    </analytic>
+    <monogr>
+        <title level="j">Hernia</title>
+        <imprint>
+            <biblScope unit="volume">23</biblScope>
+            <biblScope unit="issue">2</biblScope>
+            <biblScope unit="page" from="235" to="243" />
+            <date type="published" when="2019-01-30" />
+        </imprint>
+    </monogr>
+</biblStruct>"""
+
+    d = parse_citations_xml(citation_xml)[0]
+    assert d.title == "Mesh migration following abdominal hernia repair: a comprehensive review"
+    assert d.authors[2].given_name == "L"
+    assert d.authors[2].surname == "Taveras"
+    assert d.authors[2].name == "L R Taveras"
+    assert d.doi == "10.1007/s10029-019-01898-9"
+    assert d.pmid == "30701369"
+    assert d.date == "2019-01-30"
+    assert d.pages == "235-243"
+    assert d.volume == "23"
+    assert d.issue == "2"
+    assert d.journal == "Hernia"
+
+
+def test_citation_list_xml():
+
+    with open('tests/files/example_citation_list.xml', 'r') as f:
+        tei_xml = f.read()
+
+    citations = parse_citations_xml(tei_xml)
+    assert len(citations) == 10
+    assert citations[
+        7].title == "Global Hunger Index: The Challenge of Hidden Hunger"
-- 
cgit v1.2.3