author     Bryan Newbold <bnewbold@archive.org>    2021-10-22 19:25:24 -0700
committer  Bryan Newbold <bnewbold@archive.org>    2021-10-22 19:25:24 -0700
commit     09668907c81492774986e11f0acd9b06090dfbe0 (patch)
tree       bbe8ebc3200ce3019aaf2620dbed55d69448fa19
parent     331288cd15d471b2ae362bf6c990f627686067d8 (diff)
refactor GrobidHeader and GrobidCitation into GrobidBiblio
-rw-r--r--   grobid_tei_xml/__init__.py    2
-rwxr-xr-x   grobid_tei_xml/parse.py      89
-rw-r--r--   grobid_tei_xml/types.py      84
-rw-r--r--   tests/test_csl.py             1
-rw-r--r--   tests/test_parse.py          12
5 files changed, 68 insertions, 120 deletions
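
The net effect of the refactor: the document header and each reference now share a single GrobidBiblio dataclass, with the fields of the removed GrobidJournal (name, abbrev, publisher, issn, eissn, volume, issue) flattened onto it. A minimal usage sketch of the refactored API, assuming GROBID fulltext TEI-XML saved to a local file (the path is illustrative):

# Minimal sketch of the refactored API; "document.tei.xml" is an
# illustrative path to GROBID fulltext TEI-XML output.
import grobid_tei_xml

with open("document.tei.xml", "r") as f:
    doc = grobid_tei_xml.parse_document_xml(f.read())

# The header is now a GrobidBiblio, with flat journal/publisher/ISSN
# fields instead of a nested GrobidJournal object.
print(doc.header.title)
print(doc.header.journal)
print(doc.header.publisher)

# Citations are GrobidBiblio too, so the same fields apply to references.
for ref in doc.citations or []:
    print(ref.index, ref.title, ref.doi)
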
diff --git a/grobid_tei_xml/__init__.py b/grobid_tei_xml/__init__.py
index d7d4ada..9ddce29 100644
--- a/grobid_tei_xml/__init__.py
+++ b/grobid_tei_xml/__init__.py
@@ -2,4 +2,4 @@ __version__ = "0.1.0"
from .grobid2json import teixml2json
from .parse import parse_citations_xml, parse_document_xml
-from .types import GrobidCitation, GrobidDocument
+from .types import GrobidBiblio, GrobidDocument
diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py
index c65cbdf..1d7eec7 100755
--- a/grobid_tei_xml/parse.py
+++ b/grobid_tei_xml/parse.py
@@ -2,8 +2,7 @@ import io
import xml.etree.ElementTree as ET
from typing import Any, AnyStr, Dict, List, Optional
-from .types import (GrobidAddress, GrobidAffiliation, GrobidAuthor, GrobidCitation,
- GrobidDocument, GrobidHeader, GrobidJournal)
+from .types import GrobidAddress, GrobidAffiliation, GrobidAuthor, GrobidBiblio, GrobidDocument
xml_ns = "http://www.w3.org/XML/1998/namespace"
ns = "http://www.tei-c.org/ns/1.0"
@@ -128,80 +127,62 @@ def test_clean_url() -> None:
assert row['clean'] == _clean_url(row['dirty'])
-def _parse_citation(elem: ET.Element, ns: str = ns) -> GrobidCitation:
+def _parse_biblio(elem: ET.Element, ns: str = ns) -> GrobidBiblio:
"""
- Parses an entire TEI 'biblStruct' XML tag
+ Parses an entire TEI 'biblStruct' or 'teiHeader' XML tag
+
+ Could be document header or a citation.
"""
- citation = GrobidCitation(
+ biblio = GrobidBiblio(
+ authors=_parse_authors(elem, ns=ns),
id=elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id") or None,
+ unstructured=elem.findtext(f'.//{{{ns}}}note[@type="raw_reference"]') or None,
+
+ # date below
title=elem.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") or None,
journal=elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") or None,
- authors=_parse_authors(elem, ns=ns),
- unstructured=elem.findtext(f'.//{{{ns}}}note[@type="raw_reference"]') or None,
+ journal_abbrev=None, # XXX
+ publisher=elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") or None,
volume=elem.findtext(f'.//{{{ns}}}biblScope[@unit="volume"]') or None,
issue=elem.findtext(f'.//{{{ns}}}biblScope[@unit="issue"]') or None,
- arxiv_id=elem.findtext(f'.//{{{ns}}}idno[@type="arXiv"]') or None,
+ # pages below
+ # XXX: note
doi=elem.findtext(f'.//{{{ns}}}idno[@type="DOI"]') or None,
- pmcid=elem.findtext(f'.//{{{ns}}}idno[@type="PMCID"]') or None,
pmid=elem.findtext(f'.//{{{ns}}}idno[@type="PMID"]') or None,
+ pmcid=elem.findtext(f'.//{{{ns}}}idno[@type="PMCID"]') or None,
+ arxiv_id=elem.findtext(f'.//{{{ns}}}idno[@type="arXiv"]') or None,
+ issn=elem.findtext(f'.//{{{ns}}}idno[@type="ISSN"]') or None,
+ eissn=elem.findtext(f'.//{{{ns}}}idno[@type="eISSN"]') or None,
)
- citation.publisher = elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher")
- if not citation.publisher:
- citation.publisher = elem.findtext(f".//{{{ns}}}imprint/{{{ns}}}publisher") or None
+ if not biblio.publisher:
+ biblio.publisher = elem.findtext(f".//{{{ns}}}imprint/{{{ns}}}publisher") or None
date_tag = elem.find(f'.//{{{ns}}}date[@type="published"]')
if date_tag is not None:
- citation.date = date_tag.attrib.get("when") or None
+ biblio.date = date_tag.attrib.get("when") or None
# title stuff is messy in references...
- if citation.journal and not citation.title:
- citation.title = citation.journal
- citation.journal = None
+ if biblio.journal and not biblio.title:
+ biblio.title = biblio.journal
+ biblio.journal = None
- if citation.arxiv_id and citation.arxiv_id.startswith("arXiv:"):
- citation.arxiv_id = citation.arxiv_id[6:]
+ if biblio.arxiv_id and biblio.arxiv_id.startswith("arXiv:"):
+ biblio.arxiv_id = biblio.arxiv_id[6:]
el = elem.find(f'.//{{{ns}}}biblScope[@unit="page"]')
if el is not None:
if el.attrib.get("from") and el.attrib.get("to"):
- citation.pages = "{}-{}".format(el.attrib["from"], el.attrib["to"])
+ biblio.pages = "{}-{}".format(el.attrib["from"], el.attrib["to"])
else:
- citation.pages = el.text
+ biblio.pages = el.text
el = elem.find(f".//{{{ns}}}ptr[@target]")
if el is not None:
- citation.url = _clean_url(el.attrib["target"])
-
- return citation
+ biblio.url = _clean_url(el.attrib["target"])
-
-def _parse_journal(elem: ET.Element, ns: str = ns) -> GrobidJournal:
- journal = GrobidJournal(
- name=elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") or None,
- publisher=elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") or None,
- issn=elem.findtext(f'.//{{{ns}}}idno[@type="ISSN"]') or None,
- eissn=elem.findtext(f'.//{{{ns}}}idno[@type="eISSN"]') or None,
- volume=elem.findtext(f'.//{{{ns}}}biblScope[@unit="volume"]') or None,
- issue=elem.findtext(f'.//{{{ns}}}biblScope[@unit="issue"]') or None,
- # XXX: abbrev
- abbrev=None,
- )
- return journal
-
-
-def _parse_header(elem: ET.Element, ns: str = ns) -> GrobidHeader:
- header = GrobidHeader(
- title=elem.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") or None,
- authors=_parse_authors(elem.find(f".//{{{ns}}}sourceDesc/{{{ns}}}biblStruct")),
- journal=_parse_journal(elem) or None,
- doi=elem.findtext(f'.//{{{ns}}}idno[@type="DOI"]') or None,
- )
- date_tag = elem.find(f'.//{{{ns}}}date[@type="published"]')
- if date_tag is not None:
- header.date = date_tag.attrib.get("when") or None
- return header
+ return biblio
def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
@@ -223,13 +204,13 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
doc = GrobidDocument(
grobid_version=application_tag.attrib["version"].strip(),
grobid_timestamp=application_tag.attrib["when"].strip(),
- header=_parse_header(header),
+ header=_parse_biblio(header),
pdf_md5=header.findtext(f'.//{{{ns}}}idno[@type="MD5"]') or None,
)
refs = []
for (i, bs) in enumerate(tei.findall(f".//{{{ns}}}listBibl/{{{ns}}}biblStruct")):
- ref = _parse_citation(bs)
+ ref = _parse_biblio(bs)
ref.index = i
refs.append(ref)
doc.citations = refs
@@ -251,7 +232,7 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
return doc
-def parse_citations_xml(xml_text: AnyStr) -> List[GrobidCitation]:
+def parse_citations_xml(xml_text: AnyStr) -> List[GrobidBiblio]:
"""
Use this function to parse TEI-XML of one or more references. This should
work with either /api/processCitation or /api/processCitationList API
@@ -270,13 +251,13 @@ def parse_citations_xml(xml_text: AnyStr) -> List[GrobidCitation]:
root = tree.getroot()
if root.tag == 'biblStruct':
- ref = _parse_citation(root, ns='')
+ ref = _parse_biblio(root, ns='')
ref.index = 0
return [ref]
refs = []
for (i, bs) in enumerate(tree.findall(".//biblStruct")):
- ref = _parse_citation(bs, ns='')
+ ref = _parse_biblio(bs, ns='')
ref.index = i
refs.append(ref)
return refs
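
With _parse_header and _parse_citation collapsed into _parse_biblio, parse_citations_xml still handles both a bare <biblStruct> root (as returned by /api/processCitation) and a list of biblStruct elements, but every result is now a GrobidBiblio. A short sketch against a hand-written snippet; the XML below is illustrative, not captured GROBID output:

# Sketch: parsing a standalone citation snippet; the XML is illustrative,
# shaped like GROBID /api/processCitation output (bare <biblStruct> root).
from grobid_tei_xml import parse_citations_xml

citation_xml = """
<biblStruct>
    <analytic>
        <title level="a" type="main">Everything is Wonderful</title>
    </analytic>
    <monogr>
        <title level="j">Journal of Fake News</title>
        <imprint>
            <biblScope unit="volume">20</biblScope>
            <date type="published" when="2000" />
        </imprint>
    </monogr>
</biblStruct>
"""

refs = parse_citations_xml(citation_xml)
ref = refs[0]
print(ref.index, ref.title, ref.journal, ref.volume, ref.date)
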
diff --git a/grobid_tei_xml/types.py b/grobid_tei_xml/types.py
index 199e746..8356c8e 100644
--- a/grobid_tei_xml/types.py
+++ b/grobid_tei_xml/types.py
@@ -71,28 +71,32 @@ def test_csl_date() -> None:
@dataclass
-class GrobidCitation:
+class GrobidBiblio:
authors: List[GrobidAuthor]
-
index: Optional[int] = None
id: Optional[str] = None
+ unstructured: Optional[str] = None
+
date: Optional[str] = None
- issue: Optional[str] = None
+ title: Optional[str] = None
journal: Optional[str] = None # XXX: venue? other?
+ journal_abbrev: Optional[str] = None
publisher: Optional[str] = None
- title: Optional[str] = None
- url: Optional[str] = None
+ issn: Optional[str] = None
+ eissn: Optional[str] = None
volume: Optional[str] = None
+ issue: Optional[str] = None
pages: Optional[str] = None
first_page: Optional[str] = None # XXX
last_page: Optional[str] = None # XXX
- unstructured: Optional[str] = None
- arxiv_id: Optional[str] = None
+ note: Optional[str] = None
+
doi: Optional[str] = None
pmid: Optional[str] = None
pmcid: Optional[str] = None
+ arxiv_id: Optional[str] = None
+ url: Optional[str] = None
oa_url: Optional[str] = None
- note: Optional[str] = None
def to_dict(self) -> dict:
return _simplify_dict(asdict(self))
@@ -113,6 +117,7 @@ class GrobidCitation:
DOI=self.doi,
PMID=self.pmid,
PMCID=self.pmcid,
+ ISSN=self.issn,
note=self.note,
# fields with '-' in the key name
**{
@@ -130,56 +135,14 @@ class GrobidCitation:
@dataclass
-class GrobidJournal:
- name: Optional[str] = None
- abbrev: Optional[str] = None
- publisher: Optional[str] = None
- volume: Optional[str] = None
- issue: Optional[str] = None
- issn: Optional[str] = None
- eissn: Optional[str] = None
-
-
-@dataclass
-class GrobidHeader:
- authors: List[GrobidAuthor]
-
- title: Optional[str] = None
- date: Optional[str] = None
- doi: Optional[str] = None
- journal: Optional[GrobidJournal] = None
-
- def to_csl_dict(self, default_type: str = "article-journal") -> dict:
-
- csl = dict(
- type=default_type,
- author=[a.to_csl_dict() for a in self.authors or []],
- issued=_csl_date(self.date),
- title=self.title,
- DOI=self.doi,
- )
-
- if self.journal:
- csl['publisher'] = self.journal.publisher
- if self.journal.name:
- csl['container-title'] = self.journal.name
- if self.journal.issue and self.journal.issue.isdigit():
- csl['issue'] = int(self.issue)
- if self.journal.volume and self.journal.volume.isdigit():
- csl['volume'] = int(self.volume)
-
- return _simplify_dict(csl)
-
-
-@dataclass
class GrobidDocument:
grobid_version: str
grobid_timestamp: str
- header: GrobidHeader
+ header: GrobidBiblio
pdf_md5: Optional[str] = None
language_code: Optional[str] = None
- citations: Optional[List[GrobidCitation]] = None
+ citations: Optional[List[GrobidBiblio]] = None
abstract: Optional[str] = None
body: Optional[str] = None
acknowledgement: Optional[str] = None
@@ -203,7 +166,15 @@ class GrobidDocument:
d = self.to_dict()
# all header fields at top-level
- d.update(d.pop('header', {}))
+ header = d.pop('header', {})
+ d['journal'] = dict(
+ name=header.pop('journal', None),
+ abbrev=header.pop('journal_abbrev', None),
+ publisher=header.pop('publisher', None),
+ issn=header.pop('issn', None),
+ issne=header.pop('issne', None),
+ )
+ d.update(header)
# files not in the old schema
d.pop('pdf_md5', None)
@@ -219,7 +190,7 @@ class GrobidDocument:
for c in d['citations'] or []:
for a in c['authors']:
a['name'] = a.pop('full_name')
- return d
+ return _simplify_dict(d)
def remove_encumbered(self) -> None:
"""
@@ -236,10 +207,7 @@ class GrobidDocument:
Transforms in to Citation Style Language (CSL) JSON schema, as a dict
(not an actual JSON string)
"""
- if not self.header:
- return {}
- else:
- return self.header.to_csl_dict(default_type=default_type)
+ return self.header.to_csl_dict(default_type=default_type)
def _simplify_dict(d: dict) -> dict:
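
Because GrobidBiblio stores the journal metadata flat, to_legacy_dict() now has to re-nest those fields under a 'journal' key to keep the old schema's shape. A sketch of the resulting dict, assuming the same illustrative input file as above:

# Sketch of the legacy dict shape: header fields are hoisted to the top
# level, with the journal metadata re-nested under a 'journal' key.
# "document.tei.xml" is an illustrative path.
import grobid_tei_xml

with open("document.tei.xml", "r") as f:
    doc = grobid_tei_xml.parse_document_xml(f.read())

legacy = doc.to_legacy_dict()

print(legacy.get("title"))                         # was doc.header.title
print(legacy.get("journal", {}).get("name"))       # was doc.header.journal
print(legacy.get("journal", {}).get("publisher"))  # was doc.header.publisher

# Citation authors go back to the old 'name' key (instead of 'full_name').
for cite in legacy.get("citations") or []:
    print([a.get("name") for a in cite.get("authors", [])])
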
diff --git a/tests/test_csl.py b/tests/test_csl.py
index 9c8bd5f..c017488 100644
--- a/tests/test_csl.py
+++ b/tests/test_csl.py
@@ -24,6 +24,7 @@ def test_small_xml_csl() -> None:
"issued": [[2000]],
}
+ assert d.citations
assert d.citations[0].to_csl_dict() == {
"type": "article-journal",
"title": "Everything is Wonderful",
diff --git a/tests/test_parse.py b/tests/test_parse.py
index 7749201..25ffa64 100644
--- a/tests/test_parse.py
+++ b/tests/test_parse.py
@@ -5,8 +5,7 @@ import xml.etree.ElementTree
import pytest
-from grobid_tei_xml import (GrobidCitation, GrobidDocument, parse_citations_xml,
- parse_document_xml)
+from grobid_tei_xml import GrobidBiblio, GrobidDocument, parse_citations_xml, parse_document_xml
from grobid_tei_xml.types import *
@@ -21,7 +20,7 @@ def test_small_xml() -> None:
grobid_version='0.5.1-SNAPSHOT',
grobid_timestamp='2018-04-02T00:31+0000',
language_code='en',
- header=GrobidHeader(
+ header=GrobidBiblio(
title="Dummy Example File",
authors=[
GrobidAuthor(full_name="Brewster Kahle",
@@ -43,14 +42,13 @@ def test_small_xml() -> None:
surname="Doe",
),
],
- journal=GrobidJournal(
- name="Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678", ),
+ journal="Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
date="2000",
),
abstract="Everything you ever wanted to know about nothing",
body=expected_body,
citations=[
- GrobidCitation(
+ GrobidBiblio(
index=0,
id="b0",
authors=[
@@ -62,7 +60,7 @@ def test_small_xml() -> None:
volume="20",
pages="1-11",
),
- GrobidCitation(
+ GrobidBiblio(
index=1,
id="b1",
authors=[],
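
CSL conversion is shared as well: GrobidDocument.to_csl_dict() now delegates straight to the header's GrobidBiblio.to_csl_dict(), and every citation exposes the same method. A short sketch, again with an illustrative file path:

# Sketch: CSL-JSON conversion after the refactor; "document.tei.xml" is
# an illustrative path.
import json

import grobid_tei_xml

with open("document.tei.xml", "r") as f:
    doc = grobid_tei_xml.parse_document_xml(f.read())

# Document-level CSL is just the header's CSL dict.
csl_header = doc.to_csl_dict()

# Each citation converts the same way, now including ISSN when present.
csl_refs = [ref.to_csl_dict() for ref in doc.citations or []]

print(json.dumps(csl_header, indent=2))
print(json.dumps(csl_refs[:1], indent=2))
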