diff options
Diffstat (limited to 'grobid_tei_xml/types.py')
-rw-r--r-- | grobid_tei_xml/types.py | 20 |
1 files changed, 18 insertions, 2 deletions
diff --git a/grobid_tei_xml/types.py b/grobid_tei_xml/types.py
index b86e1a4..9894bf5 100644
--- a/grobid_tei_xml/types.py
+++ b/grobid_tei_xml/types.py
@@ -72,7 +72,7 @@ class GrobidHeader:
     title: Optional[str] = None
     date: Optional[str] = None
     doi: Optional[str] = None
-    # TODO: note: Optional[str]
+    note: Optional[str] = None
     journal: Optional[GrobidJournal] = None
 
 
@@ -80,8 +80,8 @@ class GrobidHeader:
 class GrobidDocument:
     grobid_version: str
     grobid_timestamp: str
-    # TODO: pdf_md5: Optional[str]
     header: GrobidHeader
+    pdf_md5: Optional[str] = None
     citations: Optional[List[GrobidCitation]] = None
     language_code: Optional[str] = None
     abstract: Optional[str] = None
@@ -100,6 +100,22 @@ class GrobidDocument:
         """
         return _simplify_dict(asdict(self))
 
+    def to_legacy_dict(self) -> dict:
+        """
+        Returns a dict in the old "grobid2json" format.
+        """
+        d = self.to_dict()
+
+        # all header fields at top-level
+        d.update(d.pop('header', {}))
+        d.pop('note', None)
+        d.pop('pdf_md5', None)
+        for a in d['authors']:
+            addr = a.get('affiliation', {}).get('address')
+            if addr and addr.get('post_code'):
+                addr['postCode'] = addr.pop('post_code')
+        return d
+
     def remove_encumbered(self) -> None:
         """
         This helper function removes fields from this object which might raise