about | summary | refs | log | tree | commit | diff | stats
path: root/grobid_tei_xml
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-10-21 18:22:12 -0700
committer Bryan Newbold <bnewbold@archive.org> 2021-10-21 18:22:12 -0700
commit 2bf52b0622005ed8a7c51e59faa9873600d9cb5f (patch)
tree 6de17ab8a3f77053c4f61770011af4b7de2c4a17 /grobid_tei_xml
parent 8c09c866d81854ab06b85bee6c39124c7b2faf44 (diff)
download grobid_tei_xml-2bf52b0622005ed8a7c51e59faa9873600d9cb5f.tar.gz
download grobid_tei_xml-2bf52b0622005ed8a7c51e59faa9873600d9cb5f.zip
more progress
Diffstat (limited to 'grobid_tei_xml')
-rw-r--r-- grobid_tei_xml/__init__.py 4
-rw-r--r-- grobid_tei_xml/__main__.py 18
-rw-r--r-- grobid_tei_xml/grobid2json.py 1
-rw-r--r-- grobid_tei_xml/grobid_unstructured.py 1
-rwxr-xr-x grobid_tei_xml/parse.py 34
-rw-r--r-- grobid_tei_xml/types.py 22
6 files changed, 61 insertions, 19 deletions
diff --git a/grobid_tei_xml/__init__.py b/grobid_tei_xml/__init__.py
index bf8a133..d7d4ada 100644
--- a/grobid_tei_xml/__init__.py
+++ b/grobid_tei_xml/__init__.py
@@ -1,5 +1,5 @@
__version__ = "0.1.0"
-from .types import GrobidDocument, GrobidCitation
-from .parse import parse_document_xml, parse_citations_xml
from .grobid2json import teixml2json
+from .parse import parse_citations_xml, parse_document_xml
+from .types import GrobidCitation, GrobidDocument
diff --git a/grobid_tei_xml/__main__.py b/grobid_tei_xml/__main__.py
index 489bd4e..2d10e84 100644
--- a/grobid_tei_xml/__main__.py
+++ b/grobid_tei_xml/__main__.py
@@ -1,5 +1,8 @@
+import argparse
+import json
+
+from . import parse_document_xml
-from .parse import parse_article
def main() -> None: # pragma no cover
parser = argparse.ArgumentParser(
@@ -19,11 +22,14 @@ def main() -> None: # pragma no cover
for filename in args.teifiles:
content = open(filename, "r").read()
- print(
- json.dumps(
- parse_article(content, encumbered=(not args.no_encumbered)),
- sort_keys=True,
- ))
+ doc = parse_document_xml(content)
+ if args.no_encumbered:
+ doc.body = None
+ doc.annex = None
+ doc.acknowledgements = None
+ doc.abstract = None
+ print(json.dumps(doc.to_dict(), sort_keys=True))
+
if __name__ == "__main__": # pragma no cover
main()
diff --git a/grobid_tei_xml/grobid2json.py b/grobid_tei_xml/grobid2json.py
index c005b31..3c56b19 100644
--- a/grobid_tei_xml/grobid2json.py
+++ b/grobid_tei_xml/grobid2json.py
@@ -215,6 +215,7 @@ def teixml2json(content: AnyStr, encumbered: bool = True) -> Dict[str, Any]:
info.pop(k)
return info
+
def transform_grobid_ref_xml(raw_xml: str) -> Optional[dict]:
"""
Parses GROBID XML for the case of a single reference/citation string (eg,
diff --git a/grobid_tei_xml/grobid_unstructured.py b/grobid_tei_xml/grobid_unstructured.py
index bdead05..cbf7322 100644
--- a/grobid_tei_xml/grobid_unstructured.py
+++ b/grobid_tei_xml/grobid_unstructured.py
@@ -14,4 +14,3 @@ import xml.etree.ElementTree as ET
from typing import Optional
from .parse import biblio_info
-
diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py
index a239e4d..32c5d0f 100755
--- a/grobid_tei_xml/parse.py
+++ b/grobid_tei_xml/parse.py
@@ -1,4 +1,3 @@
-
import io
import json
import xml.etree.ElementTree as ET
@@ -20,9 +19,12 @@ def _string_to_tree(content: AnyStr) -> ET:
elif isinstance(content, ET):
return content
else:
- raise TypeError(f"expected XML as string or bytes, got: {type(content)}")
+ raise TypeError(
+ f"expected XML as string or bytes, got: {type(content)}")
+
-def _parse_authors(elem: Optional[ET.Element]) -> List[GrobidAffiliation]:
+def _parse_authors(elem: Optional[ET.Element],
+ ns: str = ns) -> List[GrobidAffiliation]:
if not elem:
return []
names = []
@@ -64,6 +66,7 @@ def _parse_authors(elem: Optional[ET.Element]) -> List[GrobidAffiliation]:
names.append(GrobidAuthor(**obj))
return names
+
def _parse_citation(elem: ET.Element, ns: str = ns) -> GrobidCitation:
ref: Dict[str, Any] = dict()
ref["id"] = elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id")
@@ -78,7 +81,7 @@ def _parse_citation(elem: ET.Element, ns: str = ns) -> GrobidCitation:
else:
ref["journal"] = None
ref["title"] = other_title
- ref["authors"] = _parse_authors(elem)
+ ref["authors"] = _parse_authors(elem, ns=ns)
ref["publisher"] = elem.findtext(
f".//{{{ns}}}publicationStmt/{{{ns}}}publisher")
if not ref["publisher"]:
@@ -117,6 +120,7 @@ def _parse_citation(elem: ET.Element, ns: str = ns) -> GrobidCitation:
ref["url"] = None
return GrobidCitation(**ref)
+
def _parse_journal(elem: ET.Element, ns: str = ns) -> GrobidJournal:
journal = dict()
journal["name"] = elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title")
@@ -131,6 +135,7 @@ def _parse_journal(elem: ET.Element, ns: str = ns) -> GrobidJournal:
journal["abbrev"] = None
return GrobidJournal(**journal)
+
def _parse_header(elem: ET.Element, ns: str = ns) -> GrobidHeader:
header = elem
info = dict()
@@ -145,6 +150,7 @@ def _parse_header(elem: ET.Element, ns: str = ns) -> GrobidHeader:
info["doi"] = info["doi"].lower()
return GrobidHeader(**info)
+
def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
"""
Use this function to parse TEI-XML of a full document or header processed
@@ -155,7 +161,6 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
tree = _string_to_tree(xml_text)
tei = tree.getroot()
info = dict()
- encumbered = True
header = tei.find(f".//{{{ns}}}teiHeader")
if header is None:
@@ -188,17 +193,32 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
el = tei.find(f".//{{{ns}}}text/{{{ns}}}body")
doc.body = (el or None) and " ".join(el.itertext()).strip()
el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="acknowledgement"]')
- doc.acknowledgement = (el or None) and " ".join(
- el.itertext()).strip()
+ doc.acknowledgement = (el or None) and " ".join(el.itertext()).strip()
el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="annex"]')
doc.annex = (el or None) and " ".join(el.itertext()).strip()
return doc
+
def parse_citations_xml(xml_text: AnyStr) -> List[GrobidCitation]:
"""
Use this function to parse TEI-XML of one or more references.
Eg, the output of '/api/processReferences' or '/api/processCitation'.
"""
+ # XXX: this replacement shouldn't be needed?
+ xml_text = xml_text.replace('xmlns="http://www.tei-c.org/ns/1.0"', "")
tree = _string_to_tree(xml_text)
+ root = tree.getroot()
+
+ if root.tag == 'biblStruct':
+ ref = _parse_citation(root, ns='')
+ ref.index = 0
+ return [ref]
+
+ refs = []
+ for (i, bs) in enumerate(tree.findall(f".//biblStruct")):
+ ref = _parse_citation(bs, ns='')
+ ref.index = i
+ refs.append(ref)
+ return refs
diff --git a/grobid_tei_xml/types.py b/grobid_tei_xml/types.py
index 795d37f..aabe424 100644
--- a/grobid_tei_xml/types.py
+++ b/grobid_tei_xml/types.py
@@ -1,6 +1,5 @@
-
+from dataclasses import asdict, dataclass
from typing import Any, AnyStr, Dict, List, Optional
-from dataclasses import dataclass
@dataclass
@@ -11,6 +10,7 @@ class GrobidAddress:
country: Optional[str] = None
country_code: Optional[str] = None
+
@dataclass
class GrobidAffiliation:
address: Optional[GrobidAddress] = None
@@ -18,6 +18,7 @@ class GrobidAffiliation:
department: Optional[str] = None
laboratory: Optional[str] = None
+
@dataclass
class GrobidAuthor:
name: Optional[str]
@@ -26,6 +27,7 @@ class GrobidAuthor:
surname: Optional[str] = None
affiliation: Optional[dict] = None
+
@dataclass
class GrobidCitation:
authors: List[GrobidAuthor]
@@ -52,6 +54,7 @@ class GrobidCitation:
def to_dict(self) -> dict:
return _simplify_dict(asdict(self))
+
@dataclass
class GrobidJournal:
name: Optional[str] = None
@@ -62,6 +65,7 @@ class GrobidJournal:
issn: Optional[str] = None
eissn: Optional[str] = None
+
@dataclass
class GrobidHeader:
title: Optional[str] = None
@@ -71,6 +75,7 @@ class GrobidHeader:
#TODO: note: Optional[str]
journal: Optional[GrobidJournal] = None
+
@dataclass
class GrobidDocument:
grobid_version: str
@@ -87,10 +92,21 @@ class GrobidDocument:
def to_dict(self) -> dict:
return _simplify_dict(asdict(self))
+
def _simplify_dict(d: dict) -> dict:
+ """
+ Recursively remove empty dict values from a dict and all sub-lists and
+ sub-dicts.
+ """
+ if d in [None, {}, '']:
+ return None
for k in list(d.keys()):
if isinstance(d[k], dict):
d[k] = _simplify_dict(d[k])
- if d[k] in [None, [], {}, '']:
+ elif isinstance(d[k], list):
+ for i in range(len(d[k])):
+ if isinstance(d[k][i], dict):
+ d[k][i] = _simplify_dict(d[k][i])
+ if d[k] in [None, {}, '']:
d.pop(k)
return d