diff options
Diffstat (limited to 'grobid_tei_xml/parse.py')
-rwxr-xr-x | grobid_tei_xml/parse.py | 89 |
1 files changed, 35 insertions, 54 deletions
diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py index c65cbdf..1d7eec7 100755 --- a/grobid_tei_xml/parse.py +++ b/grobid_tei_xml/parse.py @@ -2,8 +2,7 @@ import io import xml.etree.ElementTree as ET from typing import Any, AnyStr, Dict, List, Optional -from .types import (GrobidAddress, GrobidAffiliation, GrobidAuthor, GrobidCitation, - GrobidDocument, GrobidHeader, GrobidJournal) +from .types import GrobidAddress, GrobidAffiliation, GrobidAuthor, GrobidBiblio, GrobidDocument xml_ns = "http://www.w3.org/XML/1998/namespace" ns = "http://www.tei-c.org/ns/1.0" @@ -128,80 +127,62 @@ def test_clean_url() -> None: assert row['clean'] == _clean_url(row['dirty']) -def _parse_citation(elem: ET.Element, ns: str = ns) -> GrobidCitation: +def _parse_biblio(elem: ET.Element, ns: str = ns) -> GrobidBiblio: """ - Parses an entire TEI 'biblStruct' XML tag + Parses an entire TEI 'biblStruct' or 'teiHeader' XML tag + + Could be document header or a citation. """ - citation = GrobidCitation( + biblio = GrobidBiblio( + authors=_parse_authors(elem, ns=ns), id=elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id") or None, + unstructured=elem.findtext(f'.//{{{ns}}}note[@type="raw_reference"]') or None, + + # date below title=elem.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") or None, journal=elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") or None, - authors=_parse_authors(elem, ns=ns), - unstructured=elem.findtext(f'.//{{{ns}}}note[@type="raw_reference"]') or None, + journal_abbrev=None, # XXX + publisher=elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") or None, volume=elem.findtext(f'.//{{{ns}}}biblScope[@unit="volume"]') or None, issue=elem.findtext(f'.//{{{ns}}}biblScope[@unit="issue"]') or None, - arxiv_id=elem.findtext(f'.//{{{ns}}}idno[@type="arXiv"]') or None, + # pages below + # XXX: note doi=elem.findtext(f'.//{{{ns}}}idno[@type="DOI"]') or None, - pmcid=elem.findtext(f'.//{{{ns}}}idno[@type="PMCID"]') or None, pmid=elem.findtext(f'.//{{{ns}}}idno[@type="PMID"]') or None, + pmcid=elem.findtext(f'.//{{{ns}}}idno[@type="PMCID"]') or None, + arxiv_id=elem.findtext(f'.//{{{ns}}}idno[@type="arXiv"]') or None, + issn=elem.findtext(f'.//{{{ns}}}idno[@type="ISSN"]') or None, + eissn=elem.findtext(f'.//{{{ns}}}idno[@type="eISSN"]') or None, ) - citation.publisher = elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") - if not citation.publisher: - citation.publisher = elem.findtext(f".//{{{ns}}}imprint/{{{ns}}}publisher") or None + if not biblio.publisher: + biblio.publisher = elem.findtext(f".//{{{ns}}}imprint/{{{ns}}}publisher") or None date_tag = elem.find(f'.//{{{ns}}}date[@type="published"]') if date_tag is not None: - citation.date = date_tag.attrib.get("when") or None + biblio.date = date_tag.attrib.get("when") or None # title stuff is messy in references... - if citation.journal and not citation.title: - citation.title = citation.journal - citation.journal = None + if biblio.journal and not biblio.title: + biblio.title = biblio.journal + biblio.journal = None - if citation.arxiv_id and citation.arxiv_id.startswith("arXiv:"): - citation.arxiv_id = citation.arxiv_id[6:] + if biblio.arxiv_id and biblio.arxiv_id.startswith("arXiv:"): + biblio.arxiv_id = biblio.arxiv_id[6:] el = elem.find(f'.//{{{ns}}}biblScope[@unit="page"]') if el is not None: if el.attrib.get("from") and el.attrib.get("to"): - citation.pages = "{}-{}".format(el.attrib["from"], el.attrib["to"]) + biblio.pages = "{}-{}".format(el.attrib["from"], el.attrib["to"]) else: - citation.pages = el.text + biblio.pages = el.text el = elem.find(f".//{{{ns}}}ptr[@target]") if el is not None: - citation.url = _clean_url(el.attrib["target"]) - - return citation + biblio.url = _clean_url(el.attrib["target"]) - -def _parse_journal(elem: ET.Element, ns: str = ns) -> GrobidJournal: - journal = GrobidJournal( - name=elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") or None, - publisher=elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") or None, - issn=elem.findtext(f'.//{{{ns}}}idno[@type="ISSN"]') or None, - eissn=elem.findtext(f'.//{{{ns}}}idno[@type="eISSN"]') or None, - volume=elem.findtext(f'.//{{{ns}}}biblScope[@unit="volume"]') or None, - issue=elem.findtext(f'.//{{{ns}}}biblScope[@unit="issue"]') or None, - # XXX: abbrev - abbrev=None, - ) - return journal - - -def _parse_header(elem: ET.Element, ns: str = ns) -> GrobidHeader: - header = GrobidHeader( - title=elem.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") or None, - authors=_parse_authors(elem.find(f".//{{{ns}}}sourceDesc/{{{ns}}}biblStruct")), - journal=_parse_journal(elem) or None, - doi=elem.findtext(f'.//{{{ns}}}idno[@type="DOI"]') or None, - ) - date_tag = elem.find(f'.//{{{ns}}}date[@type="published"]') - if date_tag is not None: - header.date = date_tag.attrib.get("when") or None - return header + return biblio def parse_document_xml(xml_text: AnyStr) -> GrobidDocument: @@ -223,13 +204,13 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument: doc = GrobidDocument( grobid_version=application_tag.attrib["version"].strip(), grobid_timestamp=application_tag.attrib["when"].strip(), - header=_parse_header(header), + header=_parse_biblio(header), pdf_md5=header.findtext(f'.//{{{ns}}}idno[@type="MD5"]') or None, ) refs = [] for (i, bs) in enumerate(tei.findall(f".//{{{ns}}}listBibl/{{{ns}}}biblStruct")): - ref = _parse_citation(bs) + ref = _parse_biblio(bs) ref.index = i refs.append(ref) doc.citations = refs @@ -251,7 +232,7 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument: return doc -def parse_citations_xml(xml_text: AnyStr) -> List[GrobidCitation]: +def parse_citations_xml(xml_text: AnyStr) -> List[GrobidBiblio]: """ Use this function to parse TEI-XML of one or more references. This should work with either /api/processCitation or /api/processCitationList API @@ -270,13 +251,13 @@ def parse_citations_xml(xml_text: AnyStr) -> List[GrobidCitation]: root = tree.getroot() if root.tag == 'biblStruct': - ref = _parse_citation(root, ns='') + ref = _parse_biblio(root, ns='') ref.index = 0 return [ref] refs = [] for (i, bs) in enumerate(tree.findall(".//biblStruct")): - ref = _parse_citation(bs, ns='') + ref = _parse_biblio(bs, ns='') ref.index = i refs.append(ref) return refs |