From 09668907c81492774986e11f0acd9b06090dfbe0 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 22 Oct 2021 19:25:24 -0700 Subject: refactor GrobidHeader and GrobidCitation into GrobidBiblio --- grobid_tei_xml/__init__.py | 2 +- grobid_tei_xml/parse.py | 89 ++++++++++++++++++---------------------------- grobid_tei_xml/types.py | 84 ++++++++++++++----------------------------- tests/test_csl.py | 1 + tests/test_parse.py | 12 +++---- 5 files changed, 68 insertions(+), 120 deletions(-) diff --git a/grobid_tei_xml/__init__.py b/grobid_tei_xml/__init__.py index d7d4ada..9ddce29 100644 --- a/grobid_tei_xml/__init__.py +++ b/grobid_tei_xml/__init__.py @@ -2,4 +2,4 @@ __version__ = "0.1.0" from .grobid2json import teixml2json from .parse import parse_citations_xml, parse_document_xml -from .types import GrobidCitation, GrobidDocument +from .types import GrobidBiblio, GrobidDocument diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py index c65cbdf..1d7eec7 100755 --- a/grobid_tei_xml/parse.py +++ b/grobid_tei_xml/parse.py @@ -2,8 +2,7 @@ import io import xml.etree.ElementTree as ET from typing import Any, AnyStr, Dict, List, Optional -from .types import (GrobidAddress, GrobidAffiliation, GrobidAuthor, GrobidCitation, - GrobidDocument, GrobidHeader, GrobidJournal) +from .types import GrobidAddress, GrobidAffiliation, GrobidAuthor, GrobidBiblio, GrobidDocument xml_ns = "http://www.w3.org/XML/1998/namespace" ns = "http://www.tei-c.org/ns/1.0" @@ -128,80 +127,62 @@ def test_clean_url() -> None: assert row['clean'] == _clean_url(row['dirty']) -def _parse_citation(elem: ET.Element, ns: str = ns) -> GrobidCitation: +def _parse_biblio(elem: ET.Element, ns: str = ns) -> GrobidBiblio: """ - Parses an entire TEI 'biblStruct' XML tag + Parses an entire TEI 'biblStruct' or 'teiHeader' XML tag + + Could be document header or a citation. 
""" - citation = GrobidCitation( + biblio = GrobidBiblio( + authors=_parse_authors(elem, ns=ns), id=elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id") or None, + unstructured=elem.findtext(f'.//{{{ns}}}note[@type="raw_reference"]') or None, + + # date below title=elem.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") or None, journal=elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") or None, - authors=_parse_authors(elem, ns=ns), - unstructured=elem.findtext(f'.//{{{ns}}}note[@type="raw_reference"]') or None, + journal_abbrev=None, # XXX + publisher=elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") or None, volume=elem.findtext(f'.//{{{ns}}}biblScope[@unit="volume"]') or None, issue=elem.findtext(f'.//{{{ns}}}biblScope[@unit="issue"]') or None, - arxiv_id=elem.findtext(f'.//{{{ns}}}idno[@type="arXiv"]') or None, + # pages below + # XXX: note doi=elem.findtext(f'.//{{{ns}}}idno[@type="DOI"]') or None, - pmcid=elem.findtext(f'.//{{{ns}}}idno[@type="PMCID"]') or None, pmid=elem.findtext(f'.//{{{ns}}}idno[@type="PMID"]') or None, + pmcid=elem.findtext(f'.//{{{ns}}}idno[@type="PMCID"]') or None, + arxiv_id=elem.findtext(f'.//{{{ns}}}idno[@type="arXiv"]') or None, + issn=elem.findtext(f'.//{{{ns}}}idno[@type="ISSN"]') or None, + eissn=elem.findtext(f'.//{{{ns}}}idno[@type="eISSN"]') or None, ) - citation.publisher = elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") - if not citation.publisher: - citation.publisher = elem.findtext(f".//{{{ns}}}imprint/{{{ns}}}publisher") or None + if not biblio.publisher: + biblio.publisher = elem.findtext(f".//{{{ns}}}imprint/{{{ns}}}publisher") or None date_tag = elem.find(f'.//{{{ns}}}date[@type="published"]') if date_tag is not None: - citation.date = date_tag.attrib.get("when") or None + biblio.date = date_tag.attrib.get("when") or None # title stuff is messy in references... 
- if citation.journal and not citation.title: - citation.title = citation.journal - citation.journal = None + if biblio.journal and not biblio.title: + biblio.title = biblio.journal + biblio.journal = None - if citation.arxiv_id and citation.arxiv_id.startswith("arXiv:"): - citation.arxiv_id = citation.arxiv_id[6:] + if biblio.arxiv_id and biblio.arxiv_id.startswith("arXiv:"): + biblio.arxiv_id = biblio.arxiv_id[6:] el = elem.find(f'.//{{{ns}}}biblScope[@unit="page"]') if el is not None: if el.attrib.get("from") and el.attrib.get("to"): - citation.pages = "{}-{}".format(el.attrib["from"], el.attrib["to"]) + biblio.pages = "{}-{}".format(el.attrib["from"], el.attrib["to"]) else: - citation.pages = el.text + biblio.pages = el.text el = elem.find(f".//{{{ns}}}ptr[@target]") if el is not None: - citation.url = _clean_url(el.attrib["target"]) - - return citation + biblio.url = _clean_url(el.attrib["target"]) - -def _parse_journal(elem: ET.Element, ns: str = ns) -> GrobidJournal: - journal = GrobidJournal( - name=elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") or None, - publisher=elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") or None, - issn=elem.findtext(f'.//{{{ns}}}idno[@type="ISSN"]') or None, - eissn=elem.findtext(f'.//{{{ns}}}idno[@type="eISSN"]') or None, - volume=elem.findtext(f'.//{{{ns}}}biblScope[@unit="volume"]') or None, - issue=elem.findtext(f'.//{{{ns}}}biblScope[@unit="issue"]') or None, - # XXX: abbrev - abbrev=None, - ) - return journal - - -def _parse_header(elem: ET.Element, ns: str = ns) -> GrobidHeader: - header = GrobidHeader( - title=elem.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") or None, - authors=_parse_authors(elem.find(f".//{{{ns}}}sourceDesc/{{{ns}}}biblStruct")), - journal=_parse_journal(elem) or None, - doi=elem.findtext(f'.//{{{ns}}}idno[@type="DOI"]') or None, - ) - date_tag = elem.find(f'.//{{{ns}}}date[@type="published"]') - if date_tag is not None: - header.date = date_tag.attrib.get("when") or None - return 
header + return biblio def parse_document_xml(xml_text: AnyStr) -> GrobidDocument: @@ -223,13 +204,13 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument: doc = GrobidDocument( grobid_version=application_tag.attrib["version"].strip(), grobid_timestamp=application_tag.attrib["when"].strip(), - header=_parse_header(header), + header=_parse_biblio(header), pdf_md5=header.findtext(f'.//{{{ns}}}idno[@type="MD5"]') or None, ) refs = [] for (i, bs) in enumerate(tei.findall(f".//{{{ns}}}listBibl/{{{ns}}}biblStruct")): - ref = _parse_citation(bs) + ref = _parse_biblio(bs) ref.index = i refs.append(ref) doc.citations = refs @@ -251,7 +232,7 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument: return doc -def parse_citations_xml(xml_text: AnyStr) -> List[GrobidCitation]: +def parse_citations_xml(xml_text: AnyStr) -> List[GrobidBiblio]: """ Use this function to parse TEI-XML of one or more references. This should work with either /api/processCitation or /api/processCitationList API @@ -270,13 +251,13 @@ def parse_citations_xml(xml_text: AnyStr) -> List[GrobidCitation]: root = tree.getroot() if root.tag == 'biblStruct': - ref = _parse_citation(root, ns='') + ref = _parse_biblio(root, ns='') ref.index = 0 return [ref] refs = [] for (i, bs) in enumerate(tree.findall(".//biblStruct")): - ref = _parse_citation(bs, ns='') + ref = _parse_biblio(bs, ns='') ref.index = i refs.append(ref) return refs diff --git a/grobid_tei_xml/types.py b/grobid_tei_xml/types.py index 199e746..8356c8e 100644 --- a/grobid_tei_xml/types.py +++ b/grobid_tei_xml/types.py @@ -71,28 +71,32 @@ def test_csl_date() -> None: @dataclass -class GrobidCitation: +class GrobidBiblio: authors: List[GrobidAuthor] - index: Optional[int] = None id: Optional[str] = None + unstructured: Optional[str] = None + date: Optional[str] = None - issue: Optional[str] = None + title: Optional[str] = None journal: Optional[str] = None # XXX: venue? other? 
+ journal_abbrev: Optional[str] = None publisher: Optional[str] = None - title: Optional[str] = None - url: Optional[str] = None + issn: Optional[str] = None + eissn: Optional[str] = None volume: Optional[str] = None + issue: Optional[str] = None pages: Optional[str] = None first_page: Optional[str] = None # XXX last_page: Optional[str] = None # XXX - unstructured: Optional[str] = None - arxiv_id: Optional[str] = None + note: Optional[str] = None + doi: Optional[str] = None pmid: Optional[str] = None pmcid: Optional[str] = None + arxiv_id: Optional[str] = None + url: Optional[str] = None oa_url: Optional[str] = None - note: Optional[str] = None def to_dict(self) -> dict: return _simplify_dict(asdict(self)) @@ -113,6 +117,7 @@ class GrobidCitation: DOI=self.doi, PMID=self.pmid, PMCID=self.pmcid, + ISSN=self.issn, note=self.note, # fields with '-' in the key name **{ @@ -129,57 +134,15 @@ class GrobidCitation: return _simplify_dict(csl) -@dataclass -class GrobidJournal: - name: Optional[str] = None - abbrev: Optional[str] = None - publisher: Optional[str] = None - volume: Optional[str] = None - issue: Optional[str] = None - issn: Optional[str] = None - eissn: Optional[str] = None - - -@dataclass -class GrobidHeader: - authors: List[GrobidAuthor] - - title: Optional[str] = None - date: Optional[str] = None - doi: Optional[str] = None - journal: Optional[GrobidJournal] = None - - def to_csl_dict(self, default_type: str = "article-journal") -> dict: - - csl = dict( - type=default_type, - author=[a.to_csl_dict() for a in self.authors or []], - issued=_csl_date(self.date), - title=self.title, - DOI=self.doi, - ) - - if self.journal: - csl['publisher'] = self.journal.publisher - if self.journal.name: - csl['container-title'] = self.journal.name - if self.journal.issue and self.journal.issue.isdigit(): - csl['issue'] = int(self.issue) - if self.journal.volume and self.journal.volume.isdigit(): - csl['volume'] = int(self.volume) - - return _simplify_dict(csl) - - @dataclass 
class GrobidDocument: grobid_version: str grobid_timestamp: str - header: GrobidHeader + header: GrobidBiblio pdf_md5: Optional[str] = None language_code: Optional[str] = None - citations: Optional[List[GrobidCitation]] = None + citations: Optional[List[GrobidBiblio]] = None abstract: Optional[str] = None body: Optional[str] = None acknowledgement: Optional[str] = None @@ -203,7 +166,15 @@ class GrobidDocument: d = self.to_dict() # all header fields at top-level - d.update(d.pop('header', {})) + header = d.pop('header', {}) + d['journal'] = dict( + name=header.pop('journal', None), + abbrev=header.pop('journal_abbrev', None), + publisher=header.pop('publisher', None), + issn=header.pop('issn', None), + issne=header.pop('eissn', None), + ) + d.update(header) # files not in the old schema d.pop('pdf_md5', None) @@ -219,7 +190,7 @@ class GrobidDocument: for c in d['citations'] or []: for a in c['authors']: a['name'] = a.pop('full_name') - return d + return _simplify_dict(d) def remove_encumbered(self) -> None: """ @@ -236,10 +207,7 @@ class GrobidDocument: Transforms in to Citation Style Language (CSL) JSON schema, as a dict (not an actual JSON string) """ - if not self.header: - return {} - else: - return self.header.to_csl_dict(default_type=default_type) + return self.header.to_csl_dict(default_type=default_type) def _simplify_dict(d: dict) -> dict: diff --git a/tests/test_csl.py b/tests/test_csl.py index 9c8bd5f..c017488 100644 --- a/tests/test_csl.py +++ b/tests/test_csl.py @@ -24,6 +24,7 @@ def test_small_xml_csl() -> None: "issued": [[2000]], } + assert d.citations assert d.citations[0].to_csl_dict() == { "type": "article-journal", "title": "Everything is Wonderful", diff --git a/tests/test_parse.py b/tests/test_parse.py index 7749201..25ffa64 100644 --- a/tests/test_parse.py +++ b/tests/test_parse.py @@ -5,8 +5,7 @@ import xml.etree.ElementTree import pytest -from grobid_tei_xml import (GrobidCitation, GrobidDocument, parse_citations_xml, parse_document_xml
+from grobid_tei_xml import GrobidBiblio, GrobidDocument, parse_citations_xml, parse_document_xml from grobid_tei_xml.types import * @@ -21,7 +20,7 @@ def test_small_xml() -> None: grobid_version='0.5.1-SNAPSHOT', grobid_timestamp='2018-04-02T00:31+0000', language_code='en', - header=GrobidHeader( + header=GrobidBiblio( title="Dummy Example File", authors=[ GrobidAuthor(full_name="Brewster Kahle", @@ -43,14 +42,13 @@ def test_small_xml() -> None: surname="Doe", ), ], - journal=GrobidJournal( - name="Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678", ), + journal="Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678", date="2000", ), abstract="Everything you ever wanted to know about nothing", body=expected_body, citations=[ - GrobidCitation( + GrobidBiblio( index=0, id="b0", authors=[ @@ -62,7 +60,7 @@ def test_small_xml() -> None: volume="20", pages="1-11", ), - GrobidCitation( + GrobidBiblio( index=1, id="b1", authors=[], -- cgit v1.2.3