diff options
Diffstat (limited to 'grobid_tei_xml/types.py')
-rw-r--r-- | grobid_tei_xml/types.py | 96 |
1 files changed, 96 insertions, 0 deletions
"""Dataclass containers for ``grobid_tei_xml/types.py``.

Every class here is a plain ``@dataclass``. ``GrobidCitation`` and
``GrobidDocument`` additionally offer ``to_dict()``, which converts the
instance to nested builtin containers and strips empty values.
"""

from dataclasses import asdict, dataclass
from typing import Any, AnyStr, Dict, List, Optional


@dataclass
class GrobidAddress:
    """Address fields; all are optional and default to ``None``."""

    addr_line: Optional[str] = None
    post_code: Optional[str] = None
    settlement: Optional[str] = None
    country: Optional[str] = None
    country_code: Optional[str] = None


@dataclass
class GrobidAffiliation:
    """Affiliation fields, with an optional nested ``GrobidAddress``."""

    address: Optional[GrobidAddress] = None
    institution: Optional[str] = None
    department: Optional[str] = None
    laboratory: Optional[str] = None


@dataclass
class GrobidAuthor:
    """A single author. ``name`` is the only field without a default."""

    name: Optional[str]
    # TODO: 'forename'?
    given_name: Optional[str] = None
    surname: Optional[str] = None
    # NOTE(review): annotated as a plain dict although GrobidAffiliation is
    # defined above; confirm what the parser actually stores here.
    affiliation: Optional[dict] = None


@dataclass
class GrobidCitation:
    """A single citation entry. ``authors`` is the only required field."""

    authors: List[GrobidAuthor]
    index: Optional[int] = None
    id: Optional[str] = None
    date: Optional[str] = None
    issue: Optional[str] = None
    journal: Optional[str] = None
    publisher: Optional[str] = None
    title: Optional[str] = None
    url: Optional[str] = None
    volume: Optional[str] = None
    pages: Optional[str] = None
    first_page: Optional[str] = None
    last_page: Optional[str] = None
    unstructured: Optional[str] = None
    # TODO: 'arxiv' for consistency?
    arxiv_id: Optional[str] = None
    doi: Optional[str] = None
    pmid: Optional[str] = None
    pmcid: Optional[str] = None
    oa_url: Optional[str] = None

    def to_dict(self) -> dict:
        """Return this citation as a dict with empty values removed.

        Nested dataclasses (e.g. the authors) are converted to dicts by
        ``dataclasses.asdict`` and are simplified as well.
        """
        return _simplify_dict(asdict(self))


@dataclass
class GrobidJournal:
    """Journal-level fields; all are optional and default to ``None``."""

    name: Optional[str] = None
    abbrev: Optional[str] = None
    publisher: Optional[str] = None
    volume: Optional[str] = None
    issue: Optional[str] = None
    issn: Optional[str] = None
    eissn: Optional[str] = None


@dataclass
class GrobidHeader:
    """Header fields of a document; all are optional."""

    title: Optional[str] = None
    # Annotated as a list of authors for consistency with
    # GrobidCitation.authors (was Optional[str]).
    # NOTE(review): confirm against the code that builds the header.
    authors: Optional[List[GrobidAuthor]] = None
    date: Optional[str] = None
    doi: Optional[str] = None
    # TODO: note: Optional[str]
    journal: Optional[GrobidJournal] = None


@dataclass
class GrobidDocument:
    """A whole document: required version/timestamp/header plus optional parts."""

    grobid_version: str
    grobid_timestamp: str
    # TODO: pdf_md5: Optional[str]
    header: GrobidHeader
    citations: Optional[List[GrobidCitation]] = None
    language_code: Optional[str] = None
    abstract: Optional[str] = None
    body: Optional[str] = None
    acknowledgement: Optional[str] = None
    annex: Optional[str] = None

    def to_dict(self) -> dict:
        """Return this document as a dict with empty values removed.

        The header and every citation are converted to dicts by
        ``dataclasses.asdict`` and are simplified as well.
        """
        return _simplify_dict(asdict(self))


def _simplify_dict(d: dict) -> dict:
    """Recursively drop empty values from ``d`` in place and return it.

    A value is dropped when it equals ``None``, ``[]``, ``{}`` or ``''``.
    Nested dicts, and dicts that are direct items of a list value, are
    simplified first, so a sub-dict that ends up empty is dropped too.
    List lengths are never changed: an item that simplifies to ``{}``
    stays in its list.
    """
    # Snapshot the keys because entries are popped while iterating.
    for key in list(d):
        value = d[key]
        if isinstance(value, dict):
            _simplify_dict(value)
        elif isinstance(value, list):
            for item in value:
                if isinstance(item, dict):
                    _simplify_dict(item)
        # Equality test on purpose: 0 and False are kept.
        if value in (None, [], {}, ""):
            d.pop(key)
    return d