diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-21 18:22:12 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-21 18:22:12 -0700 |
commit | 2bf52b0622005ed8a7c51e59faa9873600d9cb5f (patch) | |
tree | 6de17ab8a3f77053c4f61770011af4b7de2c4a17 /grobid_tei_xml | |
parent | 8c09c866d81854ab06b85bee6c39124c7b2faf44 (diff) | |
download | grobid_tei_xml-2bf52b0622005ed8a7c51e59faa9873600d9cb5f.tar.gz grobid_tei_xml-2bf52b0622005ed8a7c51e59faa9873600d9cb5f.zip |
more progress
Diffstat (limited to 'grobid_tei_xml')
-rw-r--r-- | grobid_tei_xml/__init__.py | 4 | ||||
-rw-r--r-- | grobid_tei_xml/__main__.py | 18 | ||||
-rw-r--r-- | grobid_tei_xml/grobid2json.py | 1 | ||||
-rw-r--r-- | grobid_tei_xml/grobid_unstructured.py | 1 | ||||
-rwxr-xr-x | grobid_tei_xml/parse.py | 34 | ||||
-rw-r--r-- | grobid_tei_xml/types.py | 22 |
6 files changed, 61 insertions, 19 deletions
diff --git a/grobid_tei_xml/__init__.py b/grobid_tei_xml/__init__.py index bf8a133..d7d4ada 100644 --- a/grobid_tei_xml/__init__.py +++ b/grobid_tei_xml/__init__.py @@ -1,5 +1,5 @@ __version__ = "0.1.0" -from .types import GrobidDocument, GrobidCitation -from .parse import parse_document_xml, parse_citations_xml from .grobid2json import teixml2json +from .parse import parse_citations_xml, parse_document_xml +from .types import GrobidCitation, GrobidDocument diff --git a/grobid_tei_xml/__main__.py b/grobid_tei_xml/__main__.py index 489bd4e..2d10e84 100644 --- a/grobid_tei_xml/__main__.py +++ b/grobid_tei_xml/__main__.py @@ -1,5 +1,8 @@ +import argparse +import json + +from . import parse_document_xml -from .parse import parse_article def main() -> None: # pragma no cover parser = argparse.ArgumentParser( @@ -19,11 +22,14 @@ def main() -> None: # pragma no cover for filename in args.teifiles: content = open(filename, "r").read() - print( - json.dumps( - parse_article(content, encumbered=(not args.no_encumbered)), - sort_keys=True, - )) + doc = parse_document_xml(content) + if args.no_encumbered: + doc.body = None + doc.annex = None + doc.acknowledgements = None + doc.abstract = None + print(json.dumps(doc.to_dict(), sort_keys=True)) + if __name__ == "__main__": # pragma no cover main() diff --git a/grobid_tei_xml/grobid2json.py b/grobid_tei_xml/grobid2json.py index c005b31..3c56b19 100644 --- a/grobid_tei_xml/grobid2json.py +++ b/grobid_tei_xml/grobid2json.py @@ -215,6 +215,7 @@ def teixml2json(content: AnyStr, encumbered: bool = True) -> Dict[str, Any]: info.pop(k) return info + def transform_grobid_ref_xml(raw_xml: str) -> Optional[dict]: """ Parses GROBID XML for the case of a single reference/citation string (eg, diff --git a/grobid_tei_xml/grobid_unstructured.py b/grobid_tei_xml/grobid_unstructured.py index bdead05..cbf7322 100644 --- a/grobid_tei_xml/grobid_unstructured.py +++ b/grobid_tei_xml/grobid_unstructured.py @@ -14,4 +14,3 @@ import xml.etree.ElementTree as ET from typing import Optional from .parse import biblio_info - diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py index a239e4d..32c5d0f 100755 --- a/grobid_tei_xml/parse.py +++ b/grobid_tei_xml/parse.py @@ -1,4 +1,3 @@ - import io import json import xml.etree.ElementTree as ET @@ -20,9 +19,12 @@ def _string_to_tree(content: AnyStr) -> ET: elif isinstance(content, ET): return content else: - raise TypeError(f"expected XML as string or bytes, got: {type(content)}") + raise TypeError( + f"expected XML as string or bytes, got: {type(content)}") + -def _parse_authors(elem: Optional[ET.Element]) -> List[GrobidAffiliation]: +def _parse_authors(elem: Optional[ET.Element], + ns: str = ns) -> List[GrobidAffiliation]: if not elem: return [] names = [] @@ -64,6 +66,7 @@ def _parse_authors(elem: Optional[ET.Element]) -> List[GrobidAffiliation]: names.append(GrobidAuthor(**obj)) return names + def _parse_citation(elem: ET.Element, ns: str = ns) -> GrobidCitation: ref: Dict[str, Any] = dict() ref["id"] = elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id") @@ -78,7 +81,7 @@ def _parse_citation(elem: ET.Element, ns: str = ns) -> GrobidCitation: else: ref["journal"] = None ref["title"] = other_title - ref["authors"] = _parse_authors(elem) + ref["authors"] = _parse_authors(elem, ns=ns) ref["publisher"] = elem.findtext( f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") if not ref["publisher"]: @@ -117,6 +120,7 @@ def _parse_citation(elem: ET.Element, ns: str = ns) -> GrobidCitation: ref["url"] = None return GrobidCitation(**ref) + def _parse_journal(elem: ET.Element, ns: str = ns) -> GrobidJournal: journal = dict() journal["name"] = elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") @@ -131,6 +135,7 @@ def _parse_journal(elem: ET.Element, ns: str = ns) -> GrobidJournal: journal["abbrev"] = None return GrobidJournal(**journal) + def _parse_header(elem: ET.Element, ns: str = ns) -> GrobidHeader: header = elem info = dict() @@ -145,6 +150,7 @@ def _parse_header(elem: ET.Element, ns: str = ns) -> GrobidHeader: info["doi"] = info["doi"].lower() return GrobidHeader(**info) + def parse_document_xml(xml_text: AnyStr) -> GrobidDocument: """ Use this function to parse TEI-XML of a full document or header processed @@ -155,7 +161,6 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument: tree = _string_to_tree(xml_text) tei = tree.getroot() info = dict() - encumbered = True header = tei.find(f".//{{{ns}}}teiHeader") if header is None: @@ -188,17 +193,32 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument: el = tei.find(f".//{{{ns}}}text/{{{ns}}}body") doc.body = (el or None) and " ".join(el.itertext()).strip() el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="acknowledgement"]') - doc.acknowledgement = (el or None) and " ".join( - el.itertext()).strip() + doc.acknowledgement = (el or None) and " ".join(el.itertext()).strip() el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="annex"]') doc.annex = (el or None) and " ".join(el.itertext()).strip() return doc + def parse_citations_xml(xml_text: AnyStr) -> List[GrobidCitation]: """ Use this function to parse TEI-XML of one or more references. Eg, the output of '/api/processReferences' or '/api/processCitation'. """ + # XXX: this replacement shouldn't be needed? + xml_text = xml_text.replace('xmlns="http://www.tei-c.org/ns/1.0"', "") tree = _string_to_tree(xml_text) + root = tree.getroot() + + if root.tag == 'biblStruct': + ref = _parse_citation(root, ns='') + ref.index = 0 + return [ref] + + refs = [] + for (i, bs) in enumerate(tree.findall(f".//biblStruct")): + ref = _parse_citation(bs, ns='') + ref.index = i + refs.append(ref) + return refs diff --git a/grobid_tei_xml/types.py b/grobid_tei_xml/types.py index 795d37f..aabe424 100644 --- a/grobid_tei_xml/types.py +++ b/grobid_tei_xml/types.py @@ -1,6 +1,5 @@ - +from dataclasses import asdict, dataclass from typing import Any, AnyStr, Dict, List, Optional -from dataclasses import dataclass @dataclass @@ -11,6 +10,7 @@ class GrobidAddress: country: Optional[str] = None country_code: Optional[str] = None + @dataclass class GrobidAffiliation: address: Optional[GrobidAddress] = None @@ -18,6 +18,7 @@ class GrobidAffiliation: department: Optional[str] = None laboratory: Optional[str] = None + @dataclass class GrobidAuthor: name: Optional[str] @@ -26,6 +27,7 @@ class GrobidAuthor: surname: Optional[str] = None affiliation: Optional[dict] = None + @dataclass class GrobidCitation: authors: List[GrobidAuthor] @@ -52,6 +54,7 @@ class GrobidCitation: def to_dict(self) -> dict: return _simplify_dict(asdict(self)) + @dataclass class GrobidJournal: name: Optional[str] = None @@ -62,6 +65,7 @@ class GrobidJournal: issn: Optional[str] = None eissn: Optional[str] = None + @dataclass class GrobidHeader: title: Optional[str] = None @@ -71,6 +75,7 @@ class GrobidHeader: #TODO: note: Optional[str] journal: Optional[GrobidJournal] = None + @dataclass class GrobidDocument: grobid_version: str @@ -87,10 +92,21 @@ class GrobidDocument: def to_dict(self) -> dict: return _simplify_dict(asdict(self)) + def _simplify_dict(d: dict) -> dict: + """ + Recursively remove empty dict values from a dict and all sub-lists and + sub-dicts. + """ + if d in [None, {}, '']: + return None for k in list(d.keys()): if isinstance(d[k], dict): d[k] = _simplify_dict(d[k]) - if d[k] in [None, [], {}, '']: + elif isinstance(d[k], list): + for i in range(len(d[k])): + if isinstance(d[k][i], dict): + d[k][i] = _simplify_dict(d[k][i]) + if d[k] in [None, {}, '']: d.pop(k) return d |