about summary refs log tree commit diff stats
path: root/grobid_tei_xml
diff options
context:
space:
mode:
author    Bryan Newbold <bnewbold@archive.org>  2021-10-25 15:46:33 -0700
committer Bryan Newbold <bnewbold@archive.org>  2021-10-25 15:46:38 -0700
commit    baa6356b80b1a826eca77f74cc487d947d2fafd4 (patch)
tree      85032c05af7561f2147358e12b8bf2f11136c2d6 /grobid_tei_xml
parent    09668907c81492774986e11f0acd9b06090dfbe0 (diff)
download  grobid_tei_xml-baa6356b80b1a826eca77f74cc487d947d2fafd4.tar.gz
          grobid_tei_xml-baa6356b80b1a826eca77f74cc487d947d2fafd4.zip
schema expansion; grobid v0.7.x examples and test coverage
Diffstat (limited to 'grobid_tei_xml')
-rwxr-xr-x  grobid_tei_xml/parse.py  189
-rw-r--r--  grobid_tei_xml/types.py   95
2 files changed, 173 insertions, 111 deletions
diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py
index 1d7eec7..da7ed97 100755
--- a/grobid_tei_xml/parse.py
+++ b/grobid_tei_xml/parse.py
@@ -25,63 +25,60 @@ def _string_to_tree(content: AnyStr) -> ET.ElementTree:
raise TypeError(f"expected XML as string or bytes, got: {type(content)}")
-def _parse_authors(elem: Optional[ET.Element], ns: str = ns) -> List[GrobidAuthor]:
+def _parse_author(elem: Optional[ET.Element], ns: str = ns) -> Optional[GrobidAuthor]:
"""
Internal helper to parse one or more TEI 'author' XML tags into
GrobidAuthor objects. 'author' could appear in document headers or
citations.
"""
- if not elem:
- return []
- authors = []
- for author_tag in elem.findall(f".//{{{ns}}}author"):
- persname_tag = author_tag.find(f"./{{{ns}}}persName")
- if persname_tag is None:
- # should we do something else here? it is possible to have author
- # without persName?
- continue
-
- # basic author name stuff
- given_name = persname_tag.findtext(f"./{{{ns}}}forename") or None
- surname = persname_tag.findtext(f"./{{{ns}}}surname") or None
- # instead create full_name from all the sub-components of the tag
- full_name = " ".join([t.strip() for t in persname_tag.itertext() if t.strip()]).strip()
- ga = GrobidAuthor(
- full_name=full_name or None,
- given_name=given_name,
- surname=surname,
- )
-
- # author affiliation
- affiliation_tag = author_tag.find(f"./{{{ns}}}affiliation")
- if affiliation_tag is not None:
- affiliation_dict: Dict[str, Any] = dict()
- for orgname_tag in affiliation_tag.findall(f"./{{{ns}}}orgName"):
- orgname_type = orgname_tag.get("type")
- if orgname_type:
- affiliation_dict[orgname_type] = orgname_tag.text or None
- if affiliation_dict:
- ga.affiliation = GrobidAffiliation(
- institution=affiliation_dict.get('institution'),
- department=affiliation_dict.get('department'),
- laboratory=affiliation_dict.get('laboratory'),
- )
- address_tag = affiliation_tag.find(f"./{{{ns}}}address")
- if address_tag is not None:
- address_dict = dict()
- for t in list(address_tag):
- address_dict[t.tag.split("}")[-1]] = t.text or None
- if address_dict:
- ga.affiliation.address = GrobidAddress(
- addr_line=address_dict.get('addrLine'),
- post_code=address_dict.get('postCode'),
- settlement=address_dict.get('settlement'),
- country=address_dict.get('country'),
- )
- authors.append(ga)
-
- return authors
+ if elem is None:
+ return None
+ persname_tag = elem.find(f"./{{{ns}}}persName")
+ if persname_tag is None:
+ # should we do something else here? it is possible to have author
+ # without persName?
+ return None
+
+ # basic author name stuff
+ # instead create full_name from all the sub-components of the tag
+ full_name = " ".join([t.strip() for t in persname_tag.itertext() if t.strip()]).strip()
+ ga = GrobidAuthor(
+ full_name=full_name or None,
+ given_name=persname_tag.findtext(f'./{{{ns}}}forename[@type="first"]'),
+ middle_name=persname_tag.findtext(f'./{{{ns}}}forename[@type="middle"]'),
+ surname=persname_tag.findtext(f"./{{{ns}}}surname"),
+ email=persname_tag.findtext(f"./{{{ns}}}email"),
+ orcid=elem.findtext(f'.//{{{ns}}}idno[@type="ORCID"]'),
+ )
+
+ # author affiliation
+ affiliation_tag = elem.find(f"./{{{ns}}}affiliation")
+ if affiliation_tag is not None:
+ affiliation_dict: Dict[str, Any] = dict()
+ for orgname_tag in affiliation_tag.findall(f"./{{{ns}}}orgName"):
+ orgname_type = orgname_tag.get("type")
+ if orgname_type:
+ affiliation_dict[orgname_type] = orgname_tag.text or None
+ if affiliation_dict:
+ ga.affiliation = GrobidAffiliation(
+ institution=affiliation_dict.get('institution'),
+ department=affiliation_dict.get('department'),
+ laboratory=affiliation_dict.get('laboratory'),
+ )
+ address_tag = affiliation_tag.find(f"./{{{ns}}}address")
+ if address_tag is not None:
+ address_dict = dict()
+ for t in list(address_tag):
+ address_dict[t.tag.split("}")[-1]] = t.text or None
+ if address_dict:
+ ga.affiliation.address = GrobidAddress(
+ addr_line=address_dict.get('addrLine'),
+ post_code=address_dict.get('postCode'),
+ settlement=address_dict.get('settlement'),
+ country=address_dict.get('country'),
+ )
+ return ga
def _clean_url(url: Optional[str]) -> Optional[str]:
@@ -134,45 +131,75 @@ def _parse_biblio(elem: ET.Element, ns: str = ns) -> GrobidBiblio:
Could be document header or a citation.
"""
+ authors = []
+ for ela in elem.findall(f".//{{{ns}}}author"):
+ a = _parse_author(ela, ns=ns)
+ if a is not None:
+ authors.append(a)
+
+ editors = []
+ editor_tags = elem.findall(f'.//{{{ns}}}editor')
+ if not editor_tags:
+ editor_tags = elem.findall(f'.//{{{ns}}}contributor[@role="editor"]')
+ for elt in editor_tags or []:
+ e = _parse_author(elt, ns=ns)
+ if e is not None:
+ editors.append(e)
+
biblio = GrobidBiblio(
- authors=_parse_authors(elem, ns=ns),
- id=elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id") or None,
- unstructured=elem.findtext(f'.//{{{ns}}}note[@type="raw_reference"]') or None,
+ authors=authors,
+ editors=editors or None,
+ id=elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id"),
+ unstructured=elem.findtext(f'.//{{{ns}}}note[@type="raw_reference"]'),
# date below
- title=elem.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") or None,
- journal=elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") or None,
- journal_abbrev=None, # XXX
- publisher=elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") or None,
- volume=elem.findtext(f'.//{{{ns}}}biblScope[@unit="volume"]') or None,
- issue=elem.findtext(f'.//{{{ns}}}biblScope[@unit="issue"]') or None,
+        # titles: @level=a for article, @level=m for manuscript (book)
+ title=elem.findtext(f'.//{{{ns}}}title[@type="main"]'),
+ journal=elem.findtext(f'.//{{{ns}}}title[@level="j"]'),
+ journal_abbrev=elem.findtext(f'.//{{{ns}}}title[@level="j"][@type="abbrev"]'),
+ series_title=elem.findtext(f'.//{{{ns}}}title[@level="s"]'),
+ publisher=elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher"),
+ institution=elem.findtext(f".//{{{ns}}}respStmt/{{{ns}}}orgName"),
+ volume=elem.findtext(f'.//{{{ns}}}biblScope[@unit="volume"]'),
+ issue=elem.findtext(f'.//{{{ns}}}biblScope[@unit="issue"]'),
# pages below
- # XXX: note
- doi=elem.findtext(f'.//{{{ns}}}idno[@type="DOI"]') or None,
- pmid=elem.findtext(f'.//{{{ns}}}idno[@type="PMID"]') or None,
- pmcid=elem.findtext(f'.//{{{ns}}}idno[@type="PMCID"]') or None,
- arxiv_id=elem.findtext(f'.//{{{ns}}}idno[@type="arXiv"]') or None,
- issn=elem.findtext(f'.//{{{ns}}}idno[@type="ISSN"]') or None,
- eissn=elem.findtext(f'.//{{{ns}}}idno[@type="eISSN"]') or None,
+ doi=elem.findtext(f'.//{{{ns}}}idno[@type="DOI"]'),
+ pmid=elem.findtext(f'.//{{{ns}}}idno[@type="PMID"]'),
+ pmcid=elem.findtext(f'.//{{{ns}}}idno[@type="PMCID"]'),
+ arxiv_id=elem.findtext(f'.//{{{ns}}}idno[@type="arXiv"]'),
+ pii=elem.findtext(f'.//{{{ns}}}idno[@type="PII"]'),
+ ark=elem.findtext(f'.//{{{ns}}}idno[@type="ark"]'),
+ istex_id=elem.findtext(f'.//{{{ns}}}idno[@type="istexId"]'),
+ issn=elem.findtext(f'.//{{{ns}}}idno[@type="ISSN"]'),
+ eissn=elem.findtext(f'.//{{{ns}}}idno[@type="eISSN"]'),
)
+ book_title_tag = elem.find(f'.//{{{ns}}}title[@level="m"]')
+ if book_title_tag is not None and book_title_tag.attrib.get('type') is None:
+ biblio.book_title = book_title_tag.text
+ if biblio.book_title and not biblio.title:
+ biblio.title = biblio.book_title
+
+ note_tag = elem.find(f'.//{{{ns}}}note')
+ if note_tag is not None and note_tag.attrib.get('type') is None:
+ biblio.note = note_tag.text
+
if not biblio.publisher:
- biblio.publisher = elem.findtext(f".//{{{ns}}}imprint/{{{ns}}}publisher") or None
+ biblio.publisher = elem.findtext(f".//{{{ns}}}imprint/{{{ns}}}publisher")
date_tag = elem.find(f'.//{{{ns}}}date[@type="published"]')
if date_tag is not None:
biblio.date = date_tag.attrib.get("when") or None
- # title stuff is messy in references...
- if biblio.journal and not biblio.title:
- biblio.title = biblio.journal
- biblio.journal = None
-
if biblio.arxiv_id and biblio.arxiv_id.startswith("arXiv:"):
biblio.arxiv_id = biblio.arxiv_id[6:]
el = elem.find(f'.//{{{ns}}}biblScope[@unit="page"]')
if el is not None:
+ if el.attrib.get("from"):
+ biblio.first_page = el.attrib["from"]
+ if el.attrib.get("to"):
+ biblio.last_page = el.attrib["to"]
if el.attrib.get("from") and el.attrib.get("to"):
biblio.pages = "{}-{}".format(el.attrib["from"], el.attrib["to"])
else:
@@ -205,7 +232,7 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
grobid_version=application_tag.attrib["version"].strip(),
grobid_timestamp=application_tag.attrib["when"].strip(),
header=_parse_biblio(header),
- pdf_md5=header.findtext(f'.//{{{ns}}}idno[@type="MD5"]') or None,
+ pdf_md5=header.findtext(f'.//{{{ns}}}idno[@type="MD5"]'),
)
refs = []
@@ -217,17 +244,23 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
text = tei.find(f".//{{{ns}}}text")
# print(text.attrib)
+
if text and text.attrib.get(f"{{{xml_ns}}}lang"):
+ # this is the 'body' language
doc.language_code = text.attrib[f"{{{xml_ns}}}lang"] # xml:lang
el = tei.find(f".//{{{ns}}}profileDesc/{{{ns}}}abstract")
- doc.abstract = (el or None) and " ".join(el.itertext()).strip()
+ if el is not None:
+ doc.abstract = " ".join(el.itertext()).strip() or None
el = tei.find(f".//{{{ns}}}text/{{{ns}}}body")
- doc.body = (el or None) and " ".join(el.itertext()).strip()
+ if el is not None:
+ doc.body = " ".join(el.itertext()).strip() or None
el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="acknowledgement"]')
- doc.acknowledgement = (el or None) and " ".join(el.itertext()).strip()
+ if el is not None:
+ doc.acknowledgement = " ".join(el.itertext()).strip() or None
el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="annex"]')
- doc.annex = (el or None) and " ".join(el.itertext()).strip()
+ if el is not None:
+ doc.annex = " ".join(el.itertext()).strip() or None
return doc
diff --git a/grobid_tei_xml/types.py b/grobid_tei_xml/types.py
index 8356c8e..252e677 100644
--- a/grobid_tei_xml/types.py
+++ b/grobid_tei_xml/types.py
@@ -8,7 +8,6 @@ class GrobidAddress:
post_code: Optional[str] = None
settlement: Optional[str] = None
country: Optional[str] = None
- country_code: Optional[str] = None # XXX
@dataclass
@@ -23,17 +22,16 @@ class GrobidAffiliation:
class GrobidAuthor:
full_name: Optional[str]
given_name: Optional[str] = None
- middle: Optional[str] = None # XXX
+ middle_name: Optional[str] = None
surname: Optional[str] = None
- suffix: Optional[str] = None # XXX
email: Optional[str] = None # XXX
+ orcid: Optional[str] = None # XXX
affiliation: Optional[GrobidAffiliation] = None
def to_csl_dict(self) -> dict:
d = dict(
- given=self.given_name,
+ given=self.given_name or self.middle_name,
family=self.surname,
- suffix=self.suffix,
)
return _simplify_dict(d)
@@ -79,28 +77,64 @@ class GrobidBiblio:
date: Optional[str] = None
title: Optional[str] = None
- journal: Optional[str] = None # XXX: venue? other?
+ book_title: Optional[str] = None
+ series_title: Optional[str] = None
+ editors: Optional[List[GrobidAuthor]] = None
+ journal: Optional[str] = None
journal_abbrev: Optional[str] = None
publisher: Optional[str] = None
+ institution: Optional[str] = None
issn: Optional[str] = None
eissn: Optional[str] = None
volume: Optional[str] = None
issue: Optional[str] = None
pages: Optional[str] = None
- first_page: Optional[str] = None # XXX
- last_page: Optional[str] = None # XXX
+ first_page: Optional[str] = None
+ last_page: Optional[str] = None
note: Optional[str] = None
doi: Optional[str] = None
pmid: Optional[str] = None
pmcid: Optional[str] = None
arxiv_id: Optional[str] = None
+ pii: Optional[str] = None
+ ark: Optional[str] = None
+ istex_id: Optional[str] = None
url: Optional[str] = None
- oa_url: Optional[str] = None
def to_dict(self) -> dict:
return _simplify_dict(asdict(self))
+ def to_legacy_dict(self) -> dict:
+ """
+ Returns a dict in the old "grobid2json" format.
+ """
+ d = self.to_dict()
+
+ # new keys
+ d.pop('first_page', None)
+ d.pop('last_page', None)
+ d.pop('note', None)
+
+ # legacy book title behavior
+ if not d.get('journal') and d.get('book_title'):
+ d['journal'] = d.pop('book_title')
+ else:
+ d.pop('book_title', None)
+
+ # author changes
+ for a in d['authors']:
+ a['name'] = a.pop('full_name', None)
+ if not a.get('given_name'):
+ a['given_name'] = a.pop('middle_name', None)
+ else:
+ a.pop('middle_name', None)
+ addr = a.get('affiliation', {}).get('address')
+ if addr and addr.get('post_code'):
+ addr['postCode'] = addr.pop('post_code')
+
+ return _simplify_dict(d)
+
def to_csl_dict(self, default_type: str = "article-journal") -> dict:
"""
Transforms in to Citation Style Language (CSL) JSON schema, as a dict
@@ -119,11 +153,14 @@ class GrobidBiblio:
PMCID=self.pmcid,
ISSN=self.issn,
note=self.note,
- # fields with '-' in the key name
- **{
- "container-title": self.journal,
- "page-first": self.first_page,
- })
+ )
+ # fields with '-' in the key name
+ csl.update({
+ "container-title": self.journal,
+ "book-title": self.book_title,
+ "series-title": self.series_title,
+ "page-first": self.first_page,
+ })
# numeric fields
if self.issue and self.issue.isdigit():
@@ -164,32 +201,24 @@ class GrobidDocument:
Returns a dict in the old "grobid2json" format.
"""
d = self.to_dict()
+ d.pop('header', None)
+ d.update(self.header.to_legacy_dict())
+ if self.citations:
+ d['citations'] = [c.to_legacy_dict() for c in self.citations]
# all header fields at top-level
- header = d.pop('header', {})
d['journal'] = dict(
- name=header.pop('journal', None),
- abbrev=header.pop('journal_abbrev', None),
- publisher=header.pop('publisher', None),
- issn=header.pop('issn', None),
- issne=header.pop('issne', None),
+ name=d.pop('journal', None),
+ publisher=d.pop('publisher', None),
+ issn=d.pop('issn', None),
+ issne=d.pop('issne', None),
+ volume=d.pop('volume', None),
+ issue=d.pop('issue', None),
)
- d.update(header)
- # files not in the old schema
+ # document fields not in the old schema
d.pop('pdf_md5', None)
- for c in d.get('citations', []):
- c.pop('note', None)
- # author changes
- for a in d['authors']:
- a['name'] = a.pop('full_name')
- addr = a.get('affiliation', {}).get('address')
- if addr and addr.get('post_code'):
- addr['postCode'] = addr.pop('post_code')
- for c in d['citations'] or []:
- for a in c['authors']:
- a['name'] = a.pop('full_name')
return _simplify_dict(d)
def remove_encumbered(self) -> None: