about | summary | refs | log | tree | commit | diff | stats
path: root/grobid_tei_xml/types.py
diff options
context:
space:
mode:
Diffstat (limited to 'grobid_tei_xml/types.py')
-rw-r--r-- grobid_tei_xml/types.py 105
1 files changed, 94 insertions, 11 deletions
diff --git a/grobid_tei_xml/types.py b/grobid_tei_xml/types.py
index 9894bf5..b78b236 100644
--- a/grobid_tei_xml/types.py
+++ b/grobid_tei_xml/types.py
@@ -8,52 +8,125 @@ class GrobidAddress:
post_code: Optional[str] = None
settlement: Optional[str] = None
country: Optional[str] = None
- country_code: Optional[str] = None
+ country_code: Optional[str] = None # XXX
@dataclass
class GrobidAffiliation:
- address: Optional[GrobidAddress] = None
institution: Optional[str] = None
department: Optional[str] = None
laboratory: Optional[str] = None
+ address: Optional[GrobidAddress] = None
@dataclass
class GrobidAuthor:
- name: Optional[str]
- # TODO: 'forename'?
+ full_name: Optional[str]
given_name: Optional[str] = None
+ middle: Optional[str] = None # XXX
surname: Optional[str] = None
+ suffix: Optional[str] = None # XXX
+ email: Optional[str] = None # XXX
affiliation: Optional[GrobidAffiliation] = None
+    def to_csl_dict(self) -> dict:
+        """
+        Returns this author's name in the shape of a CSL-JSON name entry.
+
+        Only the 'given', 'family', and 'suffix' keys are populated (from
+        `given_name`, `surname`, and `suffix` respectively); the result is
+        passed through `_simplify_dict()` before being returned.
+        """
+        name_parts = {
+            "given": self.given_name,
+            "family": self.surname,
+            "suffix": self.suffix,
+        }
+        return _simplify_dict(name_parts)
+
+
+def _csl_date(s: Optional[str]) -> Optional[list]:
+    """
+    Converts an ISO-8601 style date string into CSL-JSON 'date-parts'.
+
+    Accepts strings starting with "YYYY", "YYYY-MM", or "YYYY-MM-DD".
+    Anything following the day (eg, a "T..." time component) is ignored.
+    Parsing is best-effort: the longest valid prefix wins, so "1998-blah"
+    yields [[1998]], and an out-of-range month or day is dropped rather than
+    emitted.
+
+    Returns a single-element list wrapping [year], [year, month], or
+    [year, month, day]; or None if the string is empty or does not start
+    with a 4-digit year.
+    """
+    if not s:
+        return None
+
+    # NOTE: str.isdecimal() is used instead of str.isdigit(): the latter is
+    # also true for characters like superscript digits, which int() refuses
+    # to parse (raising ValueError)
+
+    # YYYY
+    if len(s) < 4 or not s[0:4].isdecimal():
+        return None
+    year = int(s[0:4])
+
+    # YYYY-MM
+    if len(s) < 7 or s[4] != '-' or not s[5:7].isdecimal():
+        return [[year]]
+    month = int(s[5:7])
+    if not 1 <= month <= 12:
+        return [[year]]
+
+    # YYYY-MM-DD (optionally followed by a time component, which is dropped)
+    if len(s) < 10 or s[7] != '-' or not s[8:10].isdecimal():
+        return [[year, month]]
+    day = int(s[8:10])
+    if not 1 <= day <= 31:
+        return [[year, month]]
+    return [[year, month, day]]
+
+
+def test_csl_date() -> None:
+    """
+    Checks `_csl_date()` against full, partial, and unparseable date strings.
+    """
+    parseable = [
+        ("1998", [[1998]]),
+        ("1998-03", [[1998, 3]]),
+        ("1998-03-12", [[1998, 3, 12]]),
+        # trailing junk after the year falls back to year-only
+        ("1998-blah", [[1998]]),
+    ]
+    for raw, date_parts in parseable:
+        assert _csl_date(raw) == date_parts
+
+    # no leading year at all
+    assert _csl_date("asdf") is None
+
@dataclass
class GrobidCitation:
authors: List[GrobidAuthor]
+
index: Optional[int] = None
id: Optional[str] = None
date: Optional[str] = None
issue: Optional[str] = None
- journal: Optional[str] = None
+ journal: Optional[str] = None # XXX: venue? other?
publisher: Optional[str] = None
title: Optional[str] = None
url: Optional[str] = None
volume: Optional[str] = None
pages: Optional[str] = None
- first_page: Optional[str] = None
- last_page: Optional[str] = None
+ first_page: Optional[str] = None # XXX
+ last_page: Optional[str] = None # XXX
unstructured: Optional[str] = None
- # TODO: 'arxiv' for consistency?
arxiv_id: Optional[str] = None
doi: Optional[str] = None
pmid: Optional[str] = None
pmcid: Optional[str] = None
oa_url: Optional[str] = None
+ note: Optional[str] = None
def to_dict(self) -> dict:
return _simplify_dict(asdict(self))
+    def to_csl_dict(self, default_type: str = "article-journal") -> dict:
+        """
+        Transforms into Citation Style Language (CSL) JSON schema.
+
+        `default_type` is emitted as the CSL 'type' of the item.
+
+        The 'issue' and 'volume' fields are only included when they consist
+        purely of decimal digits, in which case they are emitted as integers.
+        The result is passed through `_simplify_dict()` before being returned.
+        """
+        csl = dict(
+            type=default_type,
+            author=[a.to_csl_dict() for a in self.authors],
+            issued=_csl_date(self.date),
+            publisher=self.publisher,
+            title=self.title,
+            page=self.pages,
+            URL=self.url,
+            DOI=self.doi,
+            PMID=self.pmid,
+            PMCID=self.pmcid,
+            note=self.note,
+            # fields with '-' in the key name
+            **{
+                "container-title": self.journal,
+                "page-first": self.first_page,
+            })
+
+        # numeric fields
+        # NOTE: str.isdecimal(), not str.isdigit(): the latter is also true
+        # for characters like superscript digits, for which int() raises
+        # ValueError
+        if self.issue and self.issue.isdecimal():
+            csl['issue'] = int(self.issue)
+        if self.volume and self.volume.isdecimal():
+            csl['volume'] = int(self.volume)
+
+        return _simplify_dict(csl)
+
@dataclass
class GrobidJournal:
@@ -69,10 +142,10 @@ class GrobidJournal:
@dataclass
class GrobidHeader:
authors: List[GrobidAuthor]
+
title: Optional[str] = None
date: Optional[str] = None
doi: Optional[str] = None
- note: Optional[str] = None
journal: Optional[GrobidJournal] = None
@@ -81,9 +154,10 @@ class GrobidDocument:
grobid_version: str
grobid_timestamp: str
header: GrobidHeader
+
pdf_md5: Optional[str] = None
- citations: Optional[List[GrobidCitation]] = None
language_code: Optional[str] = None
+ citations: Optional[List[GrobidCitation]] = None
abstract: Optional[str] = None
body: Optional[str] = None
acknowledgement: Optional[str] = None
@@ -108,12 +182,21 @@ class GrobidDocument:
# all header fields at top-level
d.update(d.pop('header', {}))
- d.pop('note', None)
+
+ # fields not in the old schema
d.pop('pdf_md5', None)
+ for c in d.get('citations', []):
+ c.pop('note', None)
+
+ # author changes
for a in d['authors']:
+ a['name'] = a.pop('full_name')
addr = a.get('affiliation', {}).get('address')
if addr and addr.get('post_code'):
addr['postCode'] = addr.pop('post_code')
+ for c in d['citations'] or []:
+ for a in c['authors']:
+ a['name'] = a.pop('full_name')
return d
def remove_encumbered(self) -> None: