From fa1e2b68fdc8426b0e6239c65361b605eba5fe7b Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Fri, 22 Oct 2021 19:04:16 -0700
Subject: initial CSL conversion test

---
 grobid_tei_xml/types.py | 34 +++++++++++++++++++++++++++++++++-
 tests/test_csl.py       | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_csl.py

diff --git a/grobid_tei_xml/types.py b/grobid_tei_xml/types.py
index b78b236..199e746 100644
--- a/grobid_tei_xml/types.py
+++ b/grobid_tei_xml/types.py
@@ -99,7 +99,8 @@ class GrobidCitation:
 
     def to_csl_dict(self, default_type: str = "article-journal") -> dict:
         """
-        Transforms in to Citation Style Language (CSL) JSON schema
+        Transforms in to Citation Style Language (CSL) JSON schema, as a dict
+        (not an actual JSON string)
         """
         csl = dict(
             type=default_type,
@@ -148,6 +149,27 @@ class GrobidHeader:
     doi: Optional[str] = None
     journal: Optional[GrobidJournal] = None
 
+    def to_csl_dict(self, default_type: str = "article-journal") -> dict:
+        """Transforms in to CSL JSON schema, as a dict (not an actual JSON string)"""
+        csl = dict(
+            type=default_type,
+            author=[a.to_csl_dict() for a in self.authors or []],
+            issued=_csl_date(self.date),
+            title=self.title,
+            DOI=self.doi,
+        )
+
+        if self.journal:
+            csl['publisher'] = self.journal.publisher
+            if self.journal.name:
+                csl['container-title'] = self.journal.name
+            if self.journal.issue and self.journal.issue.isdigit():
+                csl['issue'] = int(self.journal.issue)  # digits-only, checked above
+            if self.journal.volume and self.journal.volume.isdigit():
+                csl['volume'] = int(self.journal.volume)  # digits-only, checked above
+
+        return _simplify_dict(csl)
+
 
 @dataclass
 class GrobidDocument:
@@ -209,6 +231,16 @@ class GrobidDocument:
         self.acknowledgement = None
         self.annex = None
 
+    def to_csl_dict(self, default_type: str = "article-journal") -> dict:
+        """
+        Transforms in to Citation Style Language (CSL) JSON schema, as a dict
+        (not an actual JSON string). Returns an empty dict if there is no header.
+        """
+        if not self.header:
+            return {}
+        else:
+            return self.header.to_csl_dict(default_type=default_type)
+
 
 def _simplify_dict(d: dict) -> dict:
     """
diff --git a/tests/test_csl.py b/tests/test_csl.py
new file mode 100644
index 0000000..9c8bd5f
--- /dev/null
+++ b/tests/test_csl.py
@@ -0,0 +1,40 @@
+from grobid_tei_xml import parse_document_xml
+
+
+def test_small_xml_csl() -> None:
+    """Check the CSL dicts built for the header and first citation of small.xml"""
+    with open('tests/files/small.xml', 'r', encoding='utf-8') as f:
+        tei_xml = f.read()
+
+    d = parse_document_xml(tei_xml)
+    assert d.to_csl_dict() == {
+        "type": "article-journal",
+        "title": "Dummy Example File",
+        "author": [
+            {
+                "given": "Brewster",
+                "family": "Kahle"
+            },
+            {
+                "given": "J",
+                "family": "Doe",
+            },
+        ],
+        "container-title": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
+        "issued": [[2000]],
+    }
+
+    assert d.citations[0].to_csl_dict() == {
+        "type": "article-journal",
+        "title": "Everything is Wonderful",
+        "author": [
+            {
+                "given": "A",
+                "family": "Seaperson"
+            },
+        ],
+        "container-title": "Letters in the Alphabet",
+        "issued": [[2001]],
+        "volume": 20,
+        "page": "1-11",
+    }
-- 
cgit v1.2.3