diff options
Diffstat (limited to 'grobid_tei_xml/types.py')
-rw-r--r-- | grobid_tei_xml/types.py | 74 |
1 files changed, 38 insertions, 36 deletions
diff --git a/grobid_tei_xml/types.py b/grobid_tei_xml/types.py index 08be47a..725871b 100644 --- a/grobid_tei_xml/types.py +++ b/grobid_tei_xml/types.py @@ -47,13 +47,13 @@ def _csl_date(s: Optional[str]) -> Optional[list]: return None # YYYY-MM - if len(s) >= 7 and s[4] == '-' and s[5:7].isdigit(): + if len(s) >= 7 and s[4] == "-" and s[5:7].isdigit(): month = int(s[5:7]) else: return [[year]] # YYYY-MM-DD - if len(s) == 10 and s[7] == '-' and s[8:10].isdigit(): + if len(s) == 10 and s[7] == "-" and s[8:10].isdigit(): day = int(s[8:10]) return [[year, month, day]] else: @@ -112,26 +112,26 @@ class GrobidBiblio: d = self.to_dict() # new keys - d.pop('first_page', None) - d.pop('last_page', None) - d.pop('note', None) + d.pop("first_page", None) + d.pop("last_page", None) + d.pop("note", None) # legacy book title behavior - if not d.get('journal') and d.get('book_title'): - d['journal'] = d.pop('book_title') + if not d.get("journal") and d.get("book_title"): + d["journal"] = d.pop("book_title") else: - d.pop('book_title', None) + d.pop("book_title", None) # author changes - for a in d['authors']: - a['name'] = a.pop('full_name', None) - if not a.get('given_name'): - a['given_name'] = a.pop('middle_name', None) + for a in d["authors"]: + a["name"] = a.pop("full_name", None) + if not a.get("given_name"): + a["given_name"] = a.pop("middle_name", None) else: - a.pop('middle_name', None) - addr = a.get('affiliation', {}).get('address') - if addr and addr.get('post_code'): - addr['postCode'] = addr.pop('post_code') + a.pop("middle_name", None) + addr = a.get("affiliation", {}).get("address") + if addr and addr.get("post_code"): + addr["postCode"] = addr.pop("post_code") return _simplify_dict(d) @@ -155,18 +155,20 @@ class GrobidBiblio: note=self.note, ) # fields with '-' in the key name - csl.update({ - "container-title": self.journal, - "book-title": self.book_title, - "series-title": self.series_title, - "page-first": self.first_page, - }) + csl.update( + { + "container-title": self.journal, + "book-title": self.book_title, + "series-title": self.series_title, + "page-first": self.first_page, + } + ) # numeric fields if self.issue and self.issue.isdigit(): - csl['issue'] = int(self.issue) + csl["issue"] = int(self.issue) if self.volume and self.volume.isdigit(): - csl['volume'] = int(self.volume) + csl["volume"] = int(self.volume) return _simplify_dict(csl) @@ -201,23 +203,23 @@ class GrobidDocument: Returns a dict in the old "grobid2json" format. """ d = self.to_dict() - d.pop('header', None) + d.pop("header", None) d.update(self.header.to_legacy_dict()) if self.citations: - d['citations'] = [c.to_legacy_dict() for c in self.citations] + d["citations"] = [c.to_legacy_dict() for c in self.citations] # all header fields at top-level - d['journal'] = dict( - name=d.pop('journal', None), - publisher=d.pop('publisher', None), - issn=d.pop('issn', None), - issne=d.pop('issne', None), - volume=d.pop('volume', None), - issue=d.pop('issue', None), + d["journal"] = dict( + name=d.pop("journal", None), + publisher=d.pop("publisher", None), + issn=d.pop("issn", None), + issne=d.pop("issne", None), + volume=d.pop("volume", None), + issue=d.pop("issue", None), ) # document fields not in the old schema - d.pop('pdf_md5', None) + d.pop("pdf_md5", None) return _simplify_dict(d) @@ -246,7 +248,7 @@ def _simplify_dict(d: dict) -> dict: TODO: should this return Optional[dict]? """ - if d in [None, {}, '']: + if d in [None, {}, ""]: return {} for k in list(d.keys()): if isinstance(d[k], dict): @@ -255,6 +257,6 @@ def _simplify_dict(d: dict) -> dict: for i in range(len(d[k])): if isinstance(d[k][i], dict): d[k][i] = _simplify_dict(d[k][i]) - if d[k] in [None, {}, '']: + if d[k] in [None, {}, ""]: d.pop(k) return d |