aboutsummaryrefslogtreecommitdiffstats
path: root/grobid_tei_xml/types.py
blob: 9894bf5fafcdfb29641639f99daa7da747edcfe0 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
from dataclasses import asdict, dataclass
from typing import List, Optional


@dataclass
class GrobidAddress:
    """
    Postal address record; used as the 'address' field of GrobidAffiliation.

    Every field is an optional string defaulting to None, so an instance can
    be created with only the components that are available.
    """
    addr_line: Optional[str] = None
    # NOTE: GrobidDocument.to_legacy_dict() renames this key to 'postCode'
    post_code: Optional[str] = None
    # presumably the city/town name -- TODO confirm against the parser
    settlement: Optional[str] = None
    country: Optional[str] = None
    country_code: Optional[str] = None


@dataclass
class GrobidAffiliation:
    """
    Affiliation record; used as the 'affiliation' field of GrobidAuthor.

    All fields are optional and default to None.
    """
    # nested address record (GrobidAddress), or None when not set
    address: Optional[GrobidAddress] = None
    institution: Optional[str] = None
    department: Optional[str] = None
    laboratory: Optional[str] = None


@dataclass
class GrobidAuthor:
    """
    A single author entry; used in the 'authors' lists of GrobidHeader and
    GrobidCitation.

    'name' is the only field without a default, so it must always be passed
    to the constructor (the value itself may still be None). All other
    fields default to None.
    """
    name: Optional[str]
    # TODO: 'forename'?
    given_name: Optional[str] = None
    surname: Optional[str] = None
    # nested affiliation record (GrobidAffiliation), or None when not set
    affiliation: Optional[GrobidAffiliation] = None


@dataclass
class GrobidCitation:
    """
    A single citation entry; used in the 'citations' list of GrobidDocument.

    'authors' is the only required field; every other field defaults to
    None.
    """
    authors: List[GrobidAuthor]
    index: Optional[int] = None
    id: Optional[str] = None
    date: Optional[str] = None
    issue: Optional[str] = None
    journal: Optional[str] = None
    publisher: Optional[str] = None
    title: Optional[str] = None
    url: Optional[str] = None
    volume: Optional[str] = None
    pages: Optional[str] = None
    first_page: Optional[str] = None
    last_page: Optional[str] = None
    unstructured: Optional[str] = None
    # TODO: 'arxiv' for consistency?
    arxiv_id: Optional[str] = None
    doi: Optional[str] = None
    pmid: Optional[str] = None
    pmcid: Optional[str] = None
    oa_url: Optional[str] = None

    def to_dict(self) -> dict:
        """
        Returns a dict version of this citation with empty values (None,
        empty strings, empty dicts) removed recursively by _simplify_dict().
        """
        # asdict() recurses into the nested author dataclasses, so the
        # intermediate dict contains only plain dicts, lists and scalars
        full = asdict(self)
        return _simplify_dict(full)


@dataclass
class GrobidJournal:
    """
    Journal-level metadata; used as the 'journal' field of GrobidHeader.

    Every field is an optional string defaulting to None.
    """
    name: Optional[str] = None
    # presumably the abbreviated journal title -- TODO confirm
    abbrev: Optional[str] = None
    publisher: Optional[str] = None
    volume: Optional[str] = None
    issue: Optional[str] = None
    issn: Optional[str] = None
    # presumably the electronic ISSN -- TODO confirm
    eissn: Optional[str] = None


@dataclass
class GrobidHeader:
    """
    Header-level metadata; used as the 'header' field of GrobidDocument.

    'authors' is the only required field; the rest default to None.
    GrobidDocument.to_legacy_dict() hoists these fields to the top level of
    its output and drops 'note'.
    """
    authors: List[GrobidAuthor]
    title: Optional[str] = None
    date: Optional[str] = None
    doi: Optional[str] = None
    note: Optional[str] = None
    # nested journal record (GrobidJournal), or None when not set
    journal: Optional[GrobidJournal] = None


@dataclass
class GrobidDocument:
    """
    Top-level container for a parsed document: GROBID version/timestamp,
    the header metadata, optional citations, and optional text sections.

    'grobid_version', 'grobid_timestamp' and 'header' are required; all
    other fields default to None.
    """
    grobid_version: str
    grobid_timestamp: str
    header: GrobidHeader
    pdf_md5: Optional[str] = None
    citations: Optional[List[GrobidCitation]] = None
    language_code: Optional[str] = None
    abstract: Optional[str] = None
    body: Optional[str] = None
    acknowledgement: Optional[str] = None
    annex: Optional[str] = None

    def to_dict(self) -> dict:
        """
        Returns a dict version of this object which has no empty fields
        (None, empty string, or empty dict; removed recursively), and is
        appropriate for serializing to JSON with json.dumps().

        If you did want all the fields, you could use dataclasses.asdict()
        directly on this object.
        """
        return _simplify_dict(asdict(self))

    def to_legacy_dict(self) -> dict:
        """
        Returns a dict in the old "grobid2json" format.

        Compared to to_dict(): header fields are moved to the top level,
        'note' and 'pdf_md5' are dropped, and the 'post_code' key of author
        affiliation addresses is renamed to 'postCode'.
        """
        d = self.to_dict()

        # all header fields at top-level
        d.update(d.pop('header', {}))
        d.pop('note', None)
        d.pop('pdf_md5', None)
        # 'authors' can be missing (just like 'header' above) if it was
        # stripped as empty by to_dict(); don't raise KeyError in that case
        for a in d.get('authors', []):
            addr = a.get('affiliation', {}).get('address')
            if addr and addr.get('post_code'):
                addr['postCode'] = addr.pop('post_code')
        return d

    def remove_encumbered(self) -> None:
        """
        This helper function removes fields from this object which might raise
        copyright concerns.

        Mutates the object in place: 'abstract', 'body', 'acknowledgement'
        and 'annex' are all reset to None.
        """
        self.abstract = None
        self.body = None
        self.acknowledgement = None
        self.annex = None


def _simplify_dict(d: dict) -> dict:
    """
    Recursively remove empty dict values from a dict and all sub-lists and
    sub-dicts.

    TODO: should this return Optional[dict]?
    """
    if d in [None, {}, '']:
        return {}
    for k in list(d.keys()):
        if isinstance(d[k], dict):
            d[k] = _simplify_dict(d[k])
        elif isinstance(d[k], list):
            for i in range(len(d[k])):
                if isinstance(d[k][i], dict):
                    d[k][i] = _simplify_dict(d[k][i])
        if d[k] in [None, {}, '']:
            d.pop(k)
    return d