diff options
Diffstat (limited to 'grobid_tei_xml/types.py')
-rw-r--r-- | grobid_tei_xml/types.py | 105 |
1 files changed, 94 insertions, 11 deletions
diff --git a/grobid_tei_xml/types.py b/grobid_tei_xml/types.py index 9894bf5..b78b236 100644 --- a/grobid_tei_xml/types.py +++ b/grobid_tei_xml/types.py @@ -8,52 +8,125 @@ class GrobidAddress: post_code: Optional[str] = None settlement: Optional[str] = None country: Optional[str] = None - country_code: Optional[str] = None + country_code: Optional[str] = None # XXX @dataclass class GrobidAffiliation: - address: Optional[GrobidAddress] = None institution: Optional[str] = None department: Optional[str] = None laboratory: Optional[str] = None + address: Optional[GrobidAddress] = None @dataclass class GrobidAuthor: - name: Optional[str] - # TODO: 'forename'? + full_name: Optional[str] given_name: Optional[str] = None + middle: Optional[str] = None # XXX surname: Optional[str] = None + suffix: Optional[str] = None # XXX + email: Optional[str] = None # XXX affiliation: Optional[GrobidAffiliation] = None + def to_csl_dict(self) -> dict: + d = dict( + given=self.given_name, + family=self.surname, + suffix=self.suffix, + ) + return _simplify_dict(d) + + +def _csl_date(s: Optional[str]) -> Optional[list]: + if not s: + return None + + # YYYY + if len(s) >= 4 and s[0:4].isdigit(): + year = int(s[0:4]) + else: + return None + + # YYYY-MM + if len(s) >= 7 and s[4] == '-' and s[5:7].isdigit(): + month = int(s[5:7]) + else: + return [[year]] + + # YYYY-MM-DD + if len(s) == 10 and s[7] == '-' and s[8:10].isdigit(): + day = int(s[8:10]) + return [[year, month, day]] + else: + return [[year, month]] + + +def test_csl_date() -> None: + assert _csl_date("1998") == [[1998]] + assert _csl_date("1998-03") == [[1998, 3]] + assert _csl_date("1998-03-12") == [[1998, 3, 12]] + assert _csl_date("1998-blah") == [[1998]] + assert _csl_date("asdf") is None + @dataclass class GrobidCitation: authors: List[GrobidAuthor] + index: Optional[int] = None id: Optional[str] = None date: Optional[str] = None issue: Optional[str] = None - journal: Optional[str] = None + journal: Optional[str] = None # XXX: venue? other? publisher: Optional[str] = None title: Optional[str] = None url: Optional[str] = None volume: Optional[str] = None pages: Optional[str] = None - first_page: Optional[str] = None - last_page: Optional[str] = None + first_page: Optional[str] = None # XXX + last_page: Optional[str] = None # XXX unstructured: Optional[str] = None - # TODO: 'arxiv' for consistency? arxiv_id: Optional[str] = None doi: Optional[str] = None pmid: Optional[str] = None pmcid: Optional[str] = None oa_url: Optional[str] = None + note: Optional[str] = None def to_dict(self) -> dict: return _simplify_dict(asdict(self)) + def to_csl_dict(self, default_type: str = "article-journal") -> dict: + """ + Transforms in to Citation Style Language (CSL) JSON schema + """ + csl = dict( + type=default_type, + author=[a.to_csl_dict() for a in self.authors], + issued=_csl_date(self.date), + publisher=self.publisher, + title=self.title, + page=self.pages, + URL=self.url, + DOI=self.doi, + PMID=self.pmid, + PMCID=self.pmcid, + note=self.note, + # fields with '-' in the key name + **{ + "container-title": self.journal, + "page-first": self.first_page, + }) + + # numeric fields + if self.issue and self.issue.isdigit(): + csl['issue'] = int(self.issue) + if self.volume and self.volume.isdigit(): + csl['volume'] = int(self.volume) + + return _simplify_dict(csl) + @dataclass class GrobidJournal: @@ -69,10 +142,10 @@ class GrobidJournal: @dataclass class GrobidHeader: authors: List[GrobidAuthor] + title: Optional[str] = None date: Optional[str] = None doi: Optional[str] = None - note: Optional[str] = None journal: Optional[GrobidJournal] = None @@ -81,9 +154,10 @@ class GrobidDocument: grobid_version: str grobid_timestamp: str header: GrobidHeader + pdf_md5: Optional[str] = None - citations: Optional[List[GrobidCitation]] = None language_code: Optional[str] = None + citations: Optional[List[GrobidCitation]] = None abstract: Optional[str] = None body: Optional[str] = None acknowledgement: Optional[str] = None @@ -108,12 +182,21 @@ class GrobidDocument: # all header fields at top-level d.update(d.pop('header', {})) - d.pop('note', None) + + # files not in the old schema d.pop('pdf_md5', None) + for c in d.get('citations', []): + c.pop('note', None) + + # author changes for a in d['authors']: + a['name'] = a.pop('full_name') addr = a.get('affiliation', {}).get('address') if addr and addr.get('post_code'): addr['postCode'] = addr.pop('post_code') + for c in d['citations'] or []: + for a in c['authors']: + a['name'] = a.pop('full_name') return d def remove_encumbered(self) -> None: |