refactor GrobidHeader and GrobidCitation into GrobidBiblio

author: Bryan Newbold <bnewbold@archive.org> 2021-10-22 19:25:24 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-10-22 19:25:24 -0700
commit: 09668907c81492774986e11f0acd9b06090dfbe0 (patch)
tree: bbe8ebc3200ce3019aaf2620dbed55d69448fa19
parent: 331288cd15d471b2ae362bf6c990f627686067d8 (diff)
download: grobid_tei_xml-09668907c81492774986e11f0acd9b06090dfbe0.tar.gz grobid_tei_xml-09668907c81492774986e11f0acd9b06090dfbe0.zip
5 files changed, 68 insertions, 120 deletions
diff --git a/grobid_tei_xml/__init__.py b/grobid_tei_xml/__init__.py
index d7d4ada..9ddce29 100644
--- a/grobid_tei_xml/__init__.py
+++ b/grobid_tei_xml/__init__.py
@@ -2,4 +2,4 @@ __version__ = "0.1.0"
 
 from .grobid2json import teixml2json
 from .parse import parse_citations_xml, parse_document_xml
-from .types import GrobidCitation, GrobidDocument
+from .types import GrobidBiblio, GrobidDocument
diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py
index c65cbdf..1d7eec7 100755
--- a/grobid_tei_xml/parse.py
+++ b/grobid_tei_xml/parse.py
@@ -2,8 +2,7 @@ import io
 import xml.etree.ElementTree as ET
 from typing import Any, AnyStr, Dict, List, Optional
 
-from .types import (GrobidAddress, GrobidAffiliation, GrobidAuthor, GrobidCitation,
-                    GrobidDocument, GrobidHeader, GrobidJournal)
+from .types import GrobidAddress, GrobidAffiliation, GrobidAuthor, GrobidBiblio, GrobidDocument
 
 xml_ns = "http://www.w3.org/XML/1998/namespace"
 ns = "http://www.tei-c.org/ns/1.0"
@@ -128,80 +127,62 @@ def test_clean_url() -> None:
         assert row['clean'] == _clean_url(row['dirty'])
 
 
-def _parse_citation(elem: ET.Element, ns: str = ns) -> GrobidCitation:
+def _parse_biblio(elem: ET.Element, ns: str = ns) -> GrobidBiblio:
     """
-    Parses an entire TEI 'biblStruct' XML tag
+    Parses an entire TEI 'biblStruct' or 'teiHeader' XML tag
+
+    Could be document header or a citation.
     """
 
-    citation = GrobidCitation(
+    biblio = GrobidBiblio(
+        authors=_parse_authors(elem, ns=ns),
         id=elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id") or None,
+        unstructured=elem.findtext(f'.//{{{ns}}}note[@type="raw_reference"]') or None,
+
+        # date below
         title=elem.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") or None,
         journal=elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") or None,
-        authors=_parse_authors(elem, ns=ns),
-        unstructured=elem.findtext(f'.//{{{ns}}}note[@type="raw_reference"]') or None,
+        journal_abbrev=None,  # XXX
+        publisher=elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") or None,
         volume=elem.findtext(f'.//{{{ns}}}biblScope[@unit="volume"]') or None,
         issue=elem.findtext(f'.//{{{ns}}}biblScope[@unit="issue"]') or None,
-        arxiv_id=elem.findtext(f'.//{{{ns}}}idno[@type="arXiv"]') or None,
+        # pages below
+        # XXX: note
         doi=elem.findtext(f'.//{{{ns}}}idno[@type="DOI"]') or None,
-        pmcid=elem.findtext(f'.//{{{ns}}}idno[@type="PMCID"]') or None,
         pmid=elem.findtext(f'.//{{{ns}}}idno[@type="PMID"]') or None,
+        pmcid=elem.findtext(f'.//{{{ns}}}idno[@type="PMCID"]') or None,
+        arxiv_id=elem.findtext(f'.//{{{ns}}}idno[@type="arXiv"]') or None,
+        issn=elem.findtext(f'.//{{{ns}}}idno[@type="ISSN"]') or None,
+        eissn=elem.findtext(f'.//{{{ns}}}idno[@type="eISSN"]') or None,
     )
 
-    citation.publisher = elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher")
-    if not citation.publisher:
-        citation.publisher = elem.findtext(f".//{{{ns}}}imprint/{{{ns}}}publisher") or None
+    if not biblio.publisher:
+        biblio.publisher = elem.findtext(f".//{{{ns}}}imprint/{{{ns}}}publisher") or None
 
     date_tag = elem.find(f'.//{{{ns}}}date[@type="published"]')
     if date_tag is not None:
-        citation.date = date_tag.attrib.get("when") or None
+        biblio.date = date_tag.attrib.get("when") or None
 
     # title stuff is messy in references...
-    if citation.journal and not citation.title:
-        citation.title = citation.journal
-        citation.journal = None
+    if biblio.journal and not biblio.title:
+        biblio.title = biblio.journal
+        biblio.journal = None
 
-    if citation.arxiv_id and citation.arxiv_id.startswith("arXiv:"):
-        citation.arxiv_id = citation.arxiv_id[6:]
+    if biblio.arxiv_id and biblio.arxiv_id.startswith("arXiv:"):
+        biblio.arxiv_id = biblio.arxiv_id[6:]
 
     el = elem.find(f'.//{{{ns}}}biblScope[@unit="page"]')
     if el is not None:
         if el.attrib.get("from") and el.attrib.get("to"):
-            citation.pages = "{}-{}".format(el.attrib["from"], el.attrib["to"])
+            biblio.pages = "{}-{}".format(el.attrib["from"], el.attrib["to"])
         else:
-            citation.pages = el.text
+            biblio.pages = el.text
 
     el = elem.find(f".//{{{ns}}}ptr[@target]")
     if el is not None:
-        citation.url = _clean_url(el.attrib["target"])
-
-    return citation
+        biblio.url = _clean_url(el.attrib["target"])
 
-
-def _parse_journal(elem: ET.Element, ns: str = ns) -> GrobidJournal:
-    journal = GrobidJournal(
-        name=elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") or None,
-        publisher=elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") or None,
-        issn=elem.findtext(f'.//{{{ns}}}idno[@type="ISSN"]') or None,
-        eissn=elem.findtext(f'.//{{{ns}}}idno[@type="eISSN"]') or None,
-        volume=elem.findtext(f'.//{{{ns}}}biblScope[@unit="volume"]') or None,
-        issue=elem.findtext(f'.//{{{ns}}}biblScope[@unit="issue"]') or None,
-        # XXX: abbrev
-        abbrev=None,
-    )
-    return journal
-
-
-def _parse_header(elem: ET.Element, ns: str = ns) -> GrobidHeader:
-    header = GrobidHeader(
-        title=elem.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") or None,
-        authors=_parse_authors(elem.find(f".//{{{ns}}}sourceDesc/{{{ns}}}biblStruct")),
-        journal=_parse_journal(elem) or None,
-        doi=elem.findtext(f'.//{{{ns}}}idno[@type="DOI"]') or None,
-    )
-    date_tag = elem.find(f'.//{{{ns}}}date[@type="published"]')
-    if date_tag is not None:
-        header.date = date_tag.attrib.get("when") or None
-    return header
+    return biblio
 
 
 def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
@@ -223,13 +204,13 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
     doc = GrobidDocument(
         grobid_version=application_tag.attrib["version"].strip(),
         grobid_timestamp=application_tag.attrib["when"].strip(),
-        header=_parse_header(header),
+        header=_parse_biblio(header),
         pdf_md5=header.findtext(f'.//{{{ns}}}idno[@type="MD5"]') or None,
     )
 
     refs = []
     for (i, bs) in enumerate(tei.findall(f".//{{{ns}}}listBibl/{{{ns}}}biblStruct")):
-        ref = _parse_citation(bs)
+        ref = _parse_biblio(bs)
         ref.index = i
         refs.append(ref)
     doc.citations = refs
@@ -251,7 +232,7 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
     return doc
 
 
-def parse_citations_xml(xml_text: AnyStr) -> List[GrobidCitation]:
+def parse_citations_xml(xml_text: AnyStr) -> List[GrobidBiblio]:
     """
     Use this function to parse TEI-XML of one or more references. This should
     work with either /api/processCitation or /api/processCitationList API
@@ -270,13 +251,13 @@ def parse_citations_xml(xml_text: AnyStr) -> List[GrobidCitation]:
     root = tree.getroot()
 
     if root.tag == 'biblStruct':
-        ref = _parse_citation(root, ns='')
+        ref = _parse_biblio(root, ns='')
         ref.index = 0
         return [ref]
 
     refs = []
     for (i, bs) in enumerate(tree.findall(".//biblStruct")):
-        ref = _parse_citation(bs, ns='')
+        ref = _parse_biblio(bs, ns='')
         ref.index = i
         refs.append(ref)
     return refs
diff --git a/grobid_tei_xml/types.py b/grobid_tei_xml/types.py
index 199e746..8356c8e 100644
--- a/grobid_tei_xml/types.py
+++ b/grobid_tei_xml/types.py
@@ -71,28 +71,32 @@ def test_csl_date() -> None:
 
 
 @dataclass
-class GrobidCitation:
+class GrobidBiblio:
     authors: List[GrobidAuthor]
-
     index: Optional[int] = None
     id: Optional[str] = None
+    unstructured: Optional[str] = None
+
     date: Optional[str] = None
-    issue: Optional[str] = None
+    title: Optional[str] = None
     journal: Optional[str] = None  # XXX: venue? other?
+    journal_abbrev: Optional[str] = None
     publisher: Optional[str] = None
-    title: Optional[str] = None
-    url: Optional[str] = None
+    issn: Optional[str] = None
+    eissn: Optional[str] = None
     volume: Optional[str] = None
+    issue: Optional[str] = None
     pages: Optional[str] = None
     first_page: Optional[str] = None  # XXX
     last_page: Optional[str] = None  # XXX
-    unstructured: Optional[str] = None
-    arxiv_id: Optional[str] = None
+    note: Optional[str] = None
+
     doi: Optional[str] = None
     pmid: Optional[str] = None
     pmcid: Optional[str] = None
+    arxiv_id: Optional[str] = None
+    url: Optional[str] = None
     oa_url: Optional[str] = None
-    note: Optional[str] = None
 
     def to_dict(self) -> dict:
         return _simplify_dict(asdict(self))
@@ -113,6 +117,7 @@ class GrobidCitation:
             DOI=self.doi,
             PMID=self.pmid,
             PMCID=self.pmcid,
+            ISSN=self.issn,
             note=self.note,
             # fields with '-' in the key name
             **{
@@ -130,56 +135,14 @@ class GrobidCitation:
 
 
 @dataclass
-class GrobidJournal:
-    name: Optional[str] = None
-    abbrev: Optional[str] = None
-    publisher: Optional[str] = None
-    volume: Optional[str] = None
-    issue: Optional[str] = None
-    issn: Optional[str] = None
-    eissn: Optional[str] = None
-
-
-@dataclass
-class GrobidHeader:
-    authors: List[GrobidAuthor]
-
-    title: Optional[str] = None
-    date: Optional[str] = None
-    doi: Optional[str] = None
-    journal: Optional[GrobidJournal] = None
-
-    def to_csl_dict(self, default_type: str = "article-journal") -> dict:
-
-        csl = dict(
-            type=default_type,
-            author=[a.to_csl_dict() for a in self.authors or []],
-            issued=_csl_date(self.date),
-            title=self.title,
-            DOI=self.doi,
-        )
-
-        if self.journal:
-            csl['publisher'] = self.journal.publisher
-            if self.journal.name:
-                csl['container-title'] = self.journal.name
-            if self.journal.issue and self.journal.issue.isdigit():
-                csl['issue'] = int(self.issue)
-            if self.journal.volume and self.journal.volume.isdigit():
-                csl['volume'] = int(self.volume)
-
-        return _simplify_dict(csl)
-
-
-@dataclass
 class GrobidDocument:
     grobid_version: str
     grobid_timestamp: str
-    header: GrobidHeader
+    header: GrobidBiblio
 
     pdf_md5: Optional[str] = None
     language_code: Optional[str] = None
-    citations: Optional[List[GrobidCitation]] = None
+    citations: Optional[List[GrobidBiblio]] = None
     abstract: Optional[str] = None
     body: Optional[str] = None
     acknowledgement: Optional[str] = None
@@ -203,7 +166,15 @@ class GrobidDocument:
         d = self.to_dict()
 
         # all header fields at top-level
-        d.update(d.pop('header', {}))
+        header = d.pop('header', {})
+        d['journal'] = dict(
+            name=header.pop('journal', None),
+            abbrev=header.pop('journal_abbrev', None),
+            publisher=header.pop('publisher', None),
+            issn=header.pop('issn', None),
+            eissn=header.pop('eissn', None),  # key must match GrobidBiblio.eissn
+        )
+        d.update(header)
 
         # files not in the old schema
         d.pop('pdf_md5', None)
@@ -219,7 +190,7 @@ class GrobidDocument:
         for c in d['citations'] or []:
             for a in c['authors']:
                 a['name'] = a.pop('full_name')
-        return d
+        return _simplify_dict(d)
 
     def remove_encumbered(self) -> None:
         """
@@ -236,10 +207,7 @@ class GrobidDocument:
         Transforms in to Citation Style Language (CSL) JSON schema, as a dict
         (not an actual JSON string)
         """
-        if not self.header:
-            return {}
-        else:
-            return self.header.to_csl_dict(default_type=default_type)
+        return self.header.to_csl_dict(default_type=default_type)
 
 
 def _simplify_dict(d: dict) -> dict:
diff --git a/tests/test_csl.py b/tests/test_csl.py
index 9c8bd5f..c017488 100644
--- a/tests/test_csl.py
+++ b/tests/test_csl.py
@@ -24,6 +24,7 @@ def test_small_xml_csl() -> None:
         "issued": [[2000]],
     }
 
+    assert d.citations
     assert d.citations[0].to_csl_dict() == {
         "type": "article-journal",
         "title": "Everything is Wonderful",
diff --git a/tests/test_parse.py b/tests/test_parse.py
index 7749201..25ffa64 100644
--- a/tests/test_parse.py
+++ b/tests/test_parse.py
@@ -5,8 +5,7 @@ import xml.etree.ElementTree
 
 import pytest
 
-from grobid_tei_xml import (GrobidCitation, GrobidDocument, parse_citations_xml,
-                            parse_document_xml)
+from grobid_tei_xml import GrobidBiblio, GrobidDocument, parse_citations_xml, parse_document_xml
 from grobid_tei_xml.types import *
 
 
@@ -21,7 +20,7 @@ def test_small_xml() -> None:
         grobid_version='0.5.1-SNAPSHOT',
         grobid_timestamp='2018-04-02T00:31+0000',
         language_code='en',
-        header=GrobidHeader(
+        header=GrobidBiblio(
             title="Dummy Example File",
             authors=[
                 GrobidAuthor(full_name="Brewster Kahle",
@@ -43,14 +42,13 @@ def test_small_xml() -> None:
                     surname="Doe",
                 ),
             ],
-            journal=GrobidJournal(
-                name="Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678", ),
+            journal="Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
             date="2000",
         ),
         abstract="Everything you ever wanted to know about nothing",
         body=expected_body,
         citations=[
-            GrobidCitation(
+            GrobidBiblio(
                 index=0,
                 id="b0",
                 authors=[
@@ -62,7 +60,7 @@ def test_small_xml() -> None:
                 volume="20",
                 pages="1-11",
             ),
-            GrobidCitation(
+            GrobidBiblio(
                 index=1,
                 id="b1",
                 authors=[],
author	Bryan Newbold <bnewbold@archive.org>	2021-10-22 19:25:24 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2021-10-22 19:25:24 -0700
commit	09668907c81492774986e11f0acd9b06090dfbe0 (patch)
tree	bbe8ebc3200ce3019aaf2620dbed55d69448fa19
parent	331288cd15d471b2ae362bf6c990f627686067d8 (diff)
download	grobid_tei_xml-09668907c81492774986e11f0acd9b06090dfbe0.tar.gz grobid_tei_xml-09668907c81492774986e11f0acd9b06090dfbe0.zip