aboutsummaryrefslogtreecommitdiffstats
path: root/grobid_tei_xml/parse.py
diff options
context:
space:
mode:
Diffstat (limited to 'grobid_tei_xml/parse.py')
-rwxr-xr-xgrobid_tei_xml/parse.py89
1 files changed, 35 insertions, 54 deletions
diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py
index c65cbdf..1d7eec7 100755
--- a/grobid_tei_xml/parse.py
+++ b/grobid_tei_xml/parse.py
@@ -2,8 +2,7 @@ import io
import xml.etree.ElementTree as ET
from typing import Any, AnyStr, Dict, List, Optional
-from .types import (GrobidAddress, GrobidAffiliation, GrobidAuthor, GrobidCitation,
- GrobidDocument, GrobidHeader, GrobidJournal)
+from .types import GrobidAddress, GrobidAffiliation, GrobidAuthor, GrobidBiblio, GrobidDocument
xml_ns = "http://www.w3.org/XML/1998/namespace"
ns = "http://www.tei-c.org/ns/1.0"
@@ -128,80 +127,62 @@ def test_clean_url() -> None:
assert row['clean'] == _clean_url(row['dirty'])
-def _parse_citation(elem: ET.Element, ns: str = ns) -> GrobidCitation:
+def _parse_biblio(elem: ET.Element, ns: str = ns) -> GrobidBiblio:
"""
- Parses an entire TEI 'biblStruct' XML tag
+ Parses an entire TEI 'biblStruct' or 'teiHeader' XML tag
+
+ Could be document header or a citation.
"""
- citation = GrobidCitation(
+ biblio = GrobidBiblio(
+ authors=_parse_authors(elem, ns=ns),
id=elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id") or None,
+ unstructured=elem.findtext(f'.//{{{ns}}}note[@type="raw_reference"]') or None,
+
+ # date below
title=elem.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") or None,
journal=elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") or None,
- authors=_parse_authors(elem, ns=ns),
- unstructured=elem.findtext(f'.//{{{ns}}}note[@type="raw_reference"]') or None,
+ journal_abbrev=None, # XXX
+ publisher=elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") or None,
volume=elem.findtext(f'.//{{{ns}}}biblScope[@unit="volume"]') or None,
issue=elem.findtext(f'.//{{{ns}}}biblScope[@unit="issue"]') or None,
- arxiv_id=elem.findtext(f'.//{{{ns}}}idno[@type="arXiv"]') or None,
+ # pages below
+ # XXX: note
doi=elem.findtext(f'.//{{{ns}}}idno[@type="DOI"]') or None,
- pmcid=elem.findtext(f'.//{{{ns}}}idno[@type="PMCID"]') or None,
pmid=elem.findtext(f'.//{{{ns}}}idno[@type="PMID"]') or None,
+ pmcid=elem.findtext(f'.//{{{ns}}}idno[@type="PMCID"]') or None,
+ arxiv_id=elem.findtext(f'.//{{{ns}}}idno[@type="arXiv"]') or None,
+ issn=elem.findtext(f'.//{{{ns}}}idno[@type="ISSN"]') or None,
+ eissn=elem.findtext(f'.//{{{ns}}}idno[@type="eISSN"]') or None,
)
- citation.publisher = elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher")
- if not citation.publisher:
- citation.publisher = elem.findtext(f".//{{{ns}}}imprint/{{{ns}}}publisher") or None
+ if not biblio.publisher:
+ biblio.publisher = elem.findtext(f".//{{{ns}}}imprint/{{{ns}}}publisher") or None
date_tag = elem.find(f'.//{{{ns}}}date[@type="published"]')
if date_tag is not None:
- citation.date = date_tag.attrib.get("when") or None
+ biblio.date = date_tag.attrib.get("when") or None
# title stuff is messy in references...
- if citation.journal and not citation.title:
- citation.title = citation.journal
- citation.journal = None
+ if biblio.journal and not biblio.title:
+ biblio.title = biblio.journal
+ biblio.journal = None
- if citation.arxiv_id and citation.arxiv_id.startswith("arXiv:"):
- citation.arxiv_id = citation.arxiv_id[6:]
+ if biblio.arxiv_id and biblio.arxiv_id.startswith("arXiv:"):
+ biblio.arxiv_id = biblio.arxiv_id[6:]
el = elem.find(f'.//{{{ns}}}biblScope[@unit="page"]')
if el is not None:
if el.attrib.get("from") and el.attrib.get("to"):
- citation.pages = "{}-{}".format(el.attrib["from"], el.attrib["to"])
+ biblio.pages = "{}-{}".format(el.attrib["from"], el.attrib["to"])
else:
- citation.pages = el.text
+ biblio.pages = el.text
el = elem.find(f".//{{{ns}}}ptr[@target]")
if el is not None:
- citation.url = _clean_url(el.attrib["target"])
-
- return citation
+ biblio.url = _clean_url(el.attrib["target"])
-
-def _parse_journal(elem: ET.Element, ns: str = ns) -> GrobidJournal:
- journal = GrobidJournal(
- name=elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") or None,
- publisher=elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") or None,
- issn=elem.findtext(f'.//{{{ns}}}idno[@type="ISSN"]') or None,
- eissn=elem.findtext(f'.//{{{ns}}}idno[@type="eISSN"]') or None,
- volume=elem.findtext(f'.//{{{ns}}}biblScope[@unit="volume"]') or None,
- issue=elem.findtext(f'.//{{{ns}}}biblScope[@unit="issue"]') or None,
- # XXX: abbrev
- abbrev=None,
- )
- return journal
-
-
-def _parse_header(elem: ET.Element, ns: str = ns) -> GrobidHeader:
- header = GrobidHeader(
- title=elem.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") or None,
- authors=_parse_authors(elem.find(f".//{{{ns}}}sourceDesc/{{{ns}}}biblStruct")),
- journal=_parse_journal(elem) or None,
- doi=elem.findtext(f'.//{{{ns}}}idno[@type="DOI"]') or None,
- )
- date_tag = elem.find(f'.//{{{ns}}}date[@type="published"]')
- if date_tag is not None:
- header.date = date_tag.attrib.get("when") or None
- return header
+ return biblio
def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
@@ -223,13 +204,13 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
doc = GrobidDocument(
grobid_version=application_tag.attrib["version"].strip(),
grobid_timestamp=application_tag.attrib["when"].strip(),
- header=_parse_header(header),
+ header=_parse_biblio(header),
pdf_md5=header.findtext(f'.//{{{ns}}}idno[@type="MD5"]') or None,
)
refs = []
for (i, bs) in enumerate(tei.findall(f".//{{{ns}}}listBibl/{{{ns}}}biblStruct")):
- ref = _parse_citation(bs)
+ ref = _parse_biblio(bs)
ref.index = i
refs.append(ref)
doc.citations = refs
@@ -251,7 +232,7 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
return doc
-def parse_citations_xml(xml_text: AnyStr) -> List[GrobidCitation]:
+def parse_citations_xml(xml_text: AnyStr) -> List[GrobidBiblio]:
"""
Use this function to parse TEI-XML of one or more references. This should
work with either /api/processCitation or /api/processCitationList API
@@ -270,13 +251,13 @@ def parse_citations_xml(xml_text: AnyStr) -> List[GrobidCitation]:
root = tree.getroot()
if root.tag == 'biblStruct':
- ref = _parse_citation(root, ns='')
+ ref = _parse_biblio(root, ns='')
ref.index = 0
return [ref]
refs = []
for (i, bs) in enumerate(tree.findall(".//biblStruct")):
- ref = _parse_citation(bs, ns='')
+ ref = _parse_biblio(bs, ns='')
ref.index = i
refs.append(ref)
return refs