diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-22 19:04:16 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-22 19:04:16 -0700 |
commit | fa1e2b68fdc8426b0e6239c65361b605eba5fe7b (patch) | |
tree | 0649acda1c606b2483a67f5c5c4a1168f03afc5b | |
parent | 3456336d3e4324a542c16b91734a8ebd8ef99ab9 (diff) | |
download | grobid_tei_xml-fa1e2b68fdc8426b0e6239c65361b605eba5fe7b.tar.gz grobid_tei_xml-fa1e2b68fdc8426b0e6239c65361b605eba5fe7b.zip |
initial CSL conversion test
-rw-r--r-- | grobid_tei_xml/types.py | 34 | ||||
-rw-r--r-- | tests/test_csl.py | 40 |
2 files changed, 73 insertions, 1 deletions
diff --git a/grobid_tei_xml/types.py b/grobid_tei_xml/types.py
index b78b236..199e746 100644
--- a/grobid_tei_xml/types.py
+++ b/grobid_tei_xml/types.py
@@ -99,7 +99,8 @@ class GrobidCitation:
 
     def to_csl_dict(self, default_type: str = "article-journal") -> dict:
         """
-        Transforms in to Citation Style Language (CSL) JSON schema
+        Transforms in to Citation Style Language (CSL) JSON schema, as a dict
+        (not an actual JSON string)
         """
         csl = dict(
             type=default_type,
@@ -148,6 +149,27 @@ class GrobidHeader:
     doi: Optional[str] = None
     journal: Optional[GrobidJournal] = None
 
+    def to_csl_dict(self, default_type: str = "article-journal") -> dict:
+        """Transforms header in to a CSL JSON schema dict (not a JSON string)"""
+        csl = dict(
+            type=default_type,
+            author=[a.to_csl_dict() for a in self.authors or []],
+            issued=_csl_date(self.date),
+            title=self.title,
+            DOI=self.doi,
+        )
+
+        if self.journal:
+            csl['publisher'] = self.journal.publisher
+            if self.journal.name:
+                csl['container-title'] = self.journal.name
+            if self.journal.issue and self.journal.issue.isdigit():
+                csl['issue'] = int(self.journal.issue)
+            if self.journal.volume and self.journal.volume.isdigit():
+                csl['volume'] = int(self.journal.volume)
+
+        return _simplify_dict(csl)
+
 
 @dataclass
 class GrobidDocument:
@@ -209,6 +231,16 @@ class GrobidDocument:
         self.acknowledgement = None
         self.annex = None
 
+    def to_csl_dict(self, default_type: str = "article-journal") -> dict:
+        """
+        Transforms in to Citation Style Language (CSL) JSON schema, as a dict
+        (not an actual JSON string)
+        """
+        if not self.header:
+            return {}
+        else:
+            return self.header.to_csl_dict(default_type=default_type)
+
 
 def _simplify_dict(d: dict) -> dict:
     """
diff --git a/tests/test_csl.py b/tests/test_csl.py
new file mode 100644
index 0000000..9c8bd5f
--- /dev/null
+++ b/tests/test_csl.py
@@ -0,0 +1,40 @@
+from grobid_tei_xml import parse_document_xml
+
+
+def test_small_xml_csl() -> None:
+
+    with open('tests/files/small.xml', 'r') as f:
+        tei_xml = f.read()
+
+    d = parse_document_xml(tei_xml)
+    assert d.to_csl_dict() == {
+        "type": "article-journal",
+        "title": "Dummy Example File",
+        "author": [
+            {
+                "given": "Brewster",
+                "family": "Kahle"
+            },
+            {
+                "given": "J",
+                "family": "Doe",
+            },
+        ],
+        "container-title": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
+        "issued": [[2000]],
+    }
+
+    assert d.citations[0].to_csl_dict() == {
+        "type": "article-journal",
+        "title": "Everything is Wonderful",
+        "author": [
+            {
+                "given": "A",
+                "family": "Seaperson"
+            },
+        ],
+        "container-title": "Letters in the Alphabet",
+        "issued": [[2001]],
+        "volume": 20,
+        "page": "1-11",
+    }