about | summary | refs | log | tree | commit | diff | stats
path: root/grobid_tei_xml/types.py
diff options
context:
space:
mode:
Diffstat (limited to 'grobid_tei_xml/types.py')
-rw-r--r-- grobid_tei_xml/types.py 20
1 files changed, 18 insertions, 2 deletions
diff --git a/grobid_tei_xml/types.py b/grobid_tei_xml/types.py
index b86e1a4..9894bf5 100644
--- a/grobid_tei_xml/types.py
+++ b/grobid_tei_xml/types.py
@@ -72,7 +72,7 @@ class GrobidHeader:
title: Optional[str] = None
date: Optional[str] = None
doi: Optional[str] = None
- # TODO: note: Optional[str]
+ note: Optional[str] = None
journal: Optional[GrobidJournal] = None
@@ -80,8 +80,8 @@ class GrobidHeader:
class GrobidDocument:
grobid_version: str
grobid_timestamp: str
- # TODO: pdf_md5: Optional[str]
header: GrobidHeader
+ pdf_md5: Optional[str] = None
citations: Optional[List[GrobidCitation]] = None
language_code: Optional[str] = None
abstract: Optional[str] = None
@@ -100,6 +100,22 @@ class GrobidDocument:
"""
return _simplify_dict(asdict(self))
+    def to_legacy_dict(self) -> dict:
+        """
+        Returns a dict in the old "grobid2json" format: header fields hoisted
+        to top-level, 'note'/'pdf_md5' dropped, 'post_code' -> 'postCode'.
+        """
+        d = self.to_dict()
+        # all header fields at top-level
+        d.update(d.pop('header', {}))
+        d.pop('note', None)
+        d.pop('pdf_md5', None)
+        for a in d.get('authors') or []:  # tolerate missing/None 'authors'
+            addr = (a.get('affiliation') or {}).get('address')
+            if addr and addr.get('post_code'):
+                addr['postCode'] = addr.pop('post_code')
+        return d
+
def remove_encumbered(self) -> None:
"""
This helper function removes fields from this object which might raise