1 files changed, 35 insertions, 54 deletions
diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py
index c65cbdf..1d7eec7 100755
--- a/grobid_tei_xml/parse.py
+++ b/grobid_tei_xml/parse.py
@@ -2,8 +2,7 @@ import io
 import xml.etree.ElementTree as ET
 from typing import Any, AnyStr, Dict, List, Optional
 
-from .types import (GrobidAddress, GrobidAffiliation, GrobidAuthor, GrobidCitation,
-                    GrobidDocument, GrobidHeader, GrobidJournal)
+from .types import GrobidAddress, GrobidAffiliation, GrobidAuthor, GrobidBiblio, GrobidDocument
 
 xml_ns = "http://www.w3.org/XML/1998/namespace"
 ns = "http://www.tei-c.org/ns/1.0"
@@ -128,80 +127,62 @@ def test_clean_url() -> None:
         assert row['clean'] == _clean_url(row['dirty'])
 
 
-def _parse_citation(elem: ET.Element, ns: str = ns) -> GrobidCitation:
+def _parse_biblio(elem: ET.Element, ns: str = ns) -> GrobidBiblio:
     """
-    Parses an entire TEI 'biblStruct' XML tag
+    Parses an entire TEI 'biblStruct' or 'teiHeader' XML tag
+
+    The element may be either the document header or a single citation.
     """
 
-    citation = GrobidCitation(
+    biblio = GrobidBiblio(
+        authors=_parse_authors(elem, ns=ns),
         id=elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id") or None,
+        unstructured=elem.findtext(f'.//{{{ns}}}note[@type="raw_reference"]') or None,
+
+        # date is filled in after construction, from the "published" date tag
         title=elem.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") or None,
         journal=elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") or None,
-        authors=_parse_authors(elem, ns=ns),
-        unstructured=elem.findtext(f'.//{{{ns}}}note[@type="raw_reference"]') or None,
+        journal_abbrev=None,  # XXX: abbreviated journal title is not extracted yet
+        publisher=elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") or None,
         volume=elem.findtext(f'.//{{{ns}}}biblScope[@unit="volume"]') or None,
         issue=elem.findtext(f'.//{{{ns}}}biblScope[@unit="issue"]') or None,
-        arxiv_id=elem.findtext(f'.//{{{ns}}}idno[@type="arXiv"]') or None,
+        # pages is filled in after construction, from the "page" biblScope tag
+        # XXX: 'note' is not populated yet (presumably a GrobidBiblio field; confirm)
         doi=elem.findtext(f'.//{{{ns}}}idno[@type="DOI"]') or None,
-        pmcid=elem.findtext(f'.//{{{ns}}}idno[@type="PMCID"]') or None,
         pmid=elem.findtext(f'.//{{{ns}}}idno[@type="PMID"]') or None,
+        pmcid=elem.findtext(f'.//{{{ns}}}idno[@type="PMCID"]') or None,
+        arxiv_id=elem.findtext(f'.//{{{ns}}}idno[@type="arXiv"]') or None,
+        issn=elem.findtext(f'.//{{{ns}}}idno[@type="ISSN"]') or None,
+        eissn=elem.findtext(f'.//{{{ns}}}idno[@type="eISSN"]') or None,
     )
 
-    citation.publisher = elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher")
-    if not citation.publisher:
-        citation.publisher = elem.findtext(f".//{{{ns}}}imprint/{{{ns}}}publisher") or None
+    if not biblio.publisher:
+        biblio.publisher = elem.findtext(f".//{{{ns}}}imprint/{{{ns}}}publisher") or None
 
     date_tag = elem.find(f'.//{{{ns}}}date[@type="published"]')
     if date_tag is not None:
-        citation.date = date_tag.attrib.get("when") or None
+        biblio.date = date_tag.attrib.get("when") or None
 
     # title stuff is messy in references...
-    if citation.journal and not citation.title:
-        citation.title = citation.journal
-        citation.journal = None
+    if biblio.journal and not biblio.title:
+        biblio.title = biblio.journal
+        biblio.journal = None
 
-    if citation.arxiv_id and citation.arxiv_id.startswith("arXiv:"):
-        citation.arxiv_id = citation.arxiv_id[6:]
+    if biblio.arxiv_id and biblio.arxiv_id.startswith("arXiv:"):
+        biblio.arxiv_id = biblio.arxiv_id[6:]
 
     el = elem.find(f'.//{{{ns}}}biblScope[@unit="page"]')
     if el is not None:
         if el.attrib.get("from") and el.attrib.get("to"):
-            citation.pages = "{}-{}".format(el.attrib["from"], el.attrib["to"])
+            biblio.pages = "{}-{}".format(el.attrib["from"], el.attrib["to"])
         else:
-            citation.pages = el.text
+            biblio.pages = el.text
 
     el = elem.find(f".//{{{ns}}}ptr[@target]")
     if el is not None:
-        citation.url = _clean_url(el.attrib["target"])
-
-    return citation
+        biblio.url = _clean_url(el.attrib["target"])
 
-
-def _parse_journal(elem: ET.Element, ns: str = ns) -> GrobidJournal:
-    journal = GrobidJournal(
-        name=elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") or None,
-        publisher=elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") or None,
-        issn=elem.findtext(f'.//{{{ns}}}idno[@type="ISSN"]') or None,
-        eissn=elem.findtext(f'.//{{{ns}}}idno[@type="eISSN"]') or None,
-        volume=elem.findtext(f'.//{{{ns}}}biblScope[@unit="volume"]') or None,
-        issue=elem.findtext(f'.//{{{ns}}}biblScope[@unit="issue"]') or None,
-        # XXX: abbrev
-        abbrev=None,
-    )
-    return journal
-
-
-def _parse_header(elem: ET.Element, ns: str = ns) -> GrobidHeader:
-    header = GrobidHeader(
-        title=elem.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") or None,
-        authors=_parse_authors(elem.find(f".//{{{ns}}}sourceDesc/{{{ns}}}biblStruct")),
-        journal=_parse_journal(elem) or None,
-        doi=elem.findtext(f'.//{{{ns}}}idno[@type="DOI"]') or None,
-    )
-    date_tag = elem.find(f'.//{{{ns}}}date[@type="published"]')
-    if date_tag is not None:
-        header.date = date_tag.attrib.get("when") or None
-    return header
+    return biblio
 
 
 def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
@@ -223,13 +204,13 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
     doc = GrobidDocument(
         grobid_version=application_tag.attrib["version"].strip(),
         grobid_timestamp=application_tag.attrib["when"].strip(),
-        header=_parse_header(header),
+        header=_parse_biblio(header),
         pdf_md5=header.findtext(f'.//{{{ns}}}idno[@type="MD5"]') or None,
     )
 
     refs = []
     for (i, bs) in enumerate(tei.findall(f".//{{{ns}}}listBibl/{{{ns}}}biblStruct")):
-        ref = _parse_citation(bs)
+        ref = _parse_biblio(bs)
         ref.index = i
         refs.append(ref)
     doc.citations = refs
@@ -251,7 +232,7 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
     return doc
 
 
-def parse_citations_xml(xml_text: AnyStr) -> List[GrobidCitation]:
+def parse_citations_xml(xml_text: AnyStr) -> List[GrobidBiblio]:
     """
     Use this function to parse TEI-XML of one or more references. This should
     work with either /api/processCitation or /api/processCitationList API
@@ -270,13 +251,13 @@ def parse_citations_xml(xml_text: AnyStr) -> List[GrobidCitation]:
     root = tree.getroot()
 
     if root.tag == 'biblStruct':
-        ref = _parse_citation(root, ns='')
+        ref = _parse_biblio(root, ns='')
         ref.index = 0
         return [ref]
 
     refs = []
     for (i, bs) in enumerate(tree.findall(".//biblStruct")):
-        ref = _parse_citation(bs, ns='')
+        ref = _parse_biblio(bs, ns='')
         ref.index = i
         refs.append(ref)
     return refs