aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-10-22 19:04:16 -0700
committer Bryan Newbold <bnewbold@archive.org> 2021-10-22 19:04:16 -0700
commit fa1e2b68fdc8426b0e6239c65361b605eba5fe7b (patch)
tree 0649acda1c606b2483a67f5c5c4a1168f03afc5b
parent 3456336d3e4324a542c16b91734a8ebd8ef99ab9 (diff)
download grobid_tei_xml-fa1e2b68fdc8426b0e6239c65361b605eba5fe7b.tar.gz
grobid_tei_xml-fa1e2b68fdc8426b0e6239c65361b605eba5fe7b.zip
initial CSL conversion test
-rw-r--r-- grobid_tei_xml/types.py 34
-rw-r--r-- tests/test_csl.py 40
2 files changed, 73 insertions, 1 deletions
diff --git a/grobid_tei_xml/types.py b/grobid_tei_xml/types.py
index b78b236..199e746 100644
--- a/grobid_tei_xml/types.py
+++ b/grobid_tei_xml/types.py
@@ -99,7 +99,8 @@ class GrobidCitation:
def to_csl_dict(self, default_type: str = "article-journal") -> dict:
"""
- Transforms in to Citation Style Language (CSL) JSON schema
+ Transforms in to Citation Style Language (CSL) JSON schema, as a dict
+ (not an actual JSON string)
"""
csl = dict(
type=default_type,
@@ -148,6 +149,27 @@ class GrobidHeader:
doi: Optional[str] = None
journal: Optional[GrobidJournal] = None
+    def to_csl_dict(self, default_type: str = "article-journal") -> dict:
+        """
+        Transforms this header into a Citation Style Language (CSL) JSON
+        schema dict (a plain dict, not an actual JSON string).
+
+        default_type is used verbatim as the CSL "type" field.
+
+        Journal-level fields (publisher, container-title, issue, volume) are
+        only set when this header has a journal. Issue and volume are only
+        set, as integers, when the journal value is a string of digits.
+
+        The assembled dict is passed through _simplify_dict() before being
+        returned (presumably dropping empty values -- confirm against that
+        helper).
+        """
+        csl = dict(
+            type=default_type,
+            author=[a.to_csl_dict() for a in self.authors or []],
+            issued=_csl_date(self.date),
+            title=self.title,
+            DOI=self.doi,
+        )
+
+        if self.journal:
+            csl['publisher'] = self.journal.publisher
+            if self.journal.name:
+                csl['container-title'] = self.journal.name
+            # issue and volume live on the journal object, not on the header
+            # itself: convert the same attribute that the guard just checked
+            if self.journal.issue and self.journal.issue.isdigit():
+                csl['issue'] = int(self.journal.issue)
+            if self.journal.volume and self.journal.volume.isdigit():
+                csl['volume'] = int(self.journal.volume)
+
+        return _simplify_dict(csl)
+
@dataclass
class GrobidDocument:
@@ -209,6 +231,16 @@ class GrobidDocument:
self.acknowledgement = None
self.annex = None
+    def to_csl_dict(self, default_type: str = "article-journal") -> dict:
+        """
+        Builds a Citation Style Language (CSL) JSON schema dict (a plain
+        dict, not an actual JSON string) by delegating to the header.
+
+        Returns an empty dict when this document has no header.
+        """
+        header = self.header
+        if header:
+            return header.to_csl_dict(default_type=default_type)
+        return {}
+
def _simplify_dict(d: dict) -> dict:
"""
diff --git a/tests/test_csl.py b/tests/test_csl.py
new file mode 100644
index 0000000..9c8bd5f
--- /dev/null
+++ b/tests/test_csl.py
@@ -0,0 +1,40 @@
+from grobid_tei_xml import parse_document_xml
+
+
+def test_small_xml_csl() -> None:
+    """
+    Parses the small example TEI-XML file and checks the CSL dicts produced
+    for the document itself and for its first citation.
+    """
+    expected_document_csl = {
+        "type": "article-journal",
+        "title": "Dummy Example File",
+        "author": [
+            {
+                "given": "Brewster",
+                "family": "Kahle"
+            },
+            {
+                "given": "J",
+                "family": "Doe",
+            },
+        ],
+        "container-title": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
+        "issued": [[2000]],
+    }
+    expected_citation_csl = {
+        "type": "article-journal",
+        "title": "Everything is Wonderful",
+        "author": [
+            {
+                "given": "A",
+                "family": "Seaperson"
+            },
+        ],
+        "container-title": "Letters in the Alphabet",
+        "issued": [[2001]],
+        "volume": 20,
+        "page": "1-11",
+    }
+
+    with open('tests/files/small.xml', 'r') as xml_file:
+        raw_xml = xml_file.read()
+
+    doc = parse_document_xml(raw_xml)
+
+    assert doc.to_csl_dict() == expected_document_csl
+    assert doc.citations[0].to_csl_dict() == expected_citation_csl