diff options
Diffstat (limited to 'grobid_tei_xml')
-rwxr-xr-x | grobid_tei_xml/parse.py | 189 | ||||
-rw-r--r-- | grobid_tei_xml/types.py | 95 |
2 files changed, 173 insertions, 111 deletions
diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py index 1d7eec7..da7ed97 100755 --- a/grobid_tei_xml/parse.py +++ b/grobid_tei_xml/parse.py @@ -25,63 +25,60 @@ def _string_to_tree(content: AnyStr) -> ET.ElementTree: raise TypeError(f"expected XML as string or bytes, got: {type(content)}") -def _parse_authors(elem: Optional[ET.Element], ns: str = ns) -> List[GrobidAuthor]: +def _parse_author(elem: Optional[ET.Element], ns: str = ns) -> Optional[GrobidAuthor]: """ Internal helper to parse one or more TEI 'author' XML tags into GrobidAuthor objects. 'author' could appear in document headers or citations. """ - if not elem: - return [] - authors = [] - for author_tag in elem.findall(f".//{{{ns}}}author"): - persname_tag = author_tag.find(f"./{{{ns}}}persName") - if persname_tag is None: - # should we do something else here? it is possible to have author - # without persName? - continue - - # basic author name stuff - given_name = persname_tag.findtext(f"./{{{ns}}}forename") or None - surname = persname_tag.findtext(f"./{{{ns}}}surname") or None - # instead create full_name from all the sub-components of the tag - full_name = " ".join([t.strip() for t in persname_tag.itertext() if t.strip()]).strip() - ga = GrobidAuthor( - full_name=full_name or None, - given_name=given_name, - surname=surname, - ) - - # author affiliation - affiliation_tag = author_tag.find(f"./{{{ns}}}affiliation") - if affiliation_tag is not None: - affiliation_dict: Dict[str, Any] = dict() - for orgname_tag in affiliation_tag.findall(f"./{{{ns}}}orgName"): - orgname_type = orgname_tag.get("type") - if orgname_type: - affiliation_dict[orgname_type] = orgname_tag.text or None - if affiliation_dict: - ga.affiliation = GrobidAffiliation( - institution=affiliation_dict.get('institution'), - department=affiliation_dict.get('department'), - laboratory=affiliation_dict.get('laboratory'), - ) - address_tag = affiliation_tag.find(f"./{{{ns}}}address") - if address_tag is not None: - address_dict 
= dict() - for t in list(address_tag): - address_dict[t.tag.split("}")[-1]] = t.text or None - if address_dict: - ga.affiliation.address = GrobidAddress( - addr_line=address_dict.get('addrLine'), - post_code=address_dict.get('postCode'), - settlement=address_dict.get('settlement'), - country=address_dict.get('country'), - ) - authors.append(ga) - - return authors + if elem is None: + return None + persname_tag = elem.find(f"./{{{ns}}}persName") + if persname_tag is None: + # should we do something else here? it is possible to have author + # without persName? + return None + + # basic author name stuff + # instead create full_name from all the sub-components of the tag + full_name = " ".join([t.strip() for t in persname_tag.itertext() if t.strip()]).strip() + ga = GrobidAuthor( + full_name=full_name or None, + given_name=persname_tag.findtext(f'./{{{ns}}}forename[@type="first"]'), + middle_name=persname_tag.findtext(f'./{{{ns}}}forename[@type="middle"]'), + surname=persname_tag.findtext(f"./{{{ns}}}surname"), + email=persname_tag.findtext(f"./{{{ns}}}email"), + orcid=elem.findtext(f'.//{{{ns}}}idno[@type="ORCID"]'), + ) + + # author affiliation + affiliation_tag = elem.find(f"./{{{ns}}}affiliation") + if affiliation_tag is not None: + affiliation_dict: Dict[str, Any] = dict() + for orgname_tag in affiliation_tag.findall(f"./{{{ns}}}orgName"): + orgname_type = orgname_tag.get("type") + if orgname_type: + affiliation_dict[orgname_type] = orgname_tag.text or None + if affiliation_dict: + ga.affiliation = GrobidAffiliation( + institution=affiliation_dict.get('institution'), + department=affiliation_dict.get('department'), + laboratory=affiliation_dict.get('laboratory'), + ) + address_tag = affiliation_tag.find(f"./{{{ns}}}address") + if address_tag is not None: + address_dict = dict() + for t in list(address_tag): + address_dict[t.tag.split("}")[-1]] = t.text or None + if address_dict: + ga.affiliation.address = GrobidAddress( + addr_line=address_dict.get('addrLine'), 
+ post_code=address_dict.get('postCode'), + settlement=address_dict.get('settlement'), + country=address_dict.get('country'), + ) + return ga def _clean_url(url: Optional[str]) -> Optional[str]: @@ -134,45 +131,75 @@ def _parse_biblio(elem: ET.Element, ns: str = ns) -> GrobidBiblio: Could be document header or a citation. """ + authors = [] + for ela in elem.findall(f".//{{{ns}}}author"): + a = _parse_author(ela, ns=ns) + if a is not None: + authors.append(a) + + editors = [] + editor_tags = elem.findall(f'.//{{{ns}}}editor') + if not editor_tags: + editor_tags = elem.findall(f'.//{{{ns}}}contributor[@role="editor"]') + for elt in editor_tags or []: + e = _parse_author(elt, ns=ns) + if e is not None: + editors.append(e) + biblio = GrobidBiblio( - authors=_parse_authors(elem, ns=ns), - id=elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id") or None, - unstructured=elem.findtext(f'.//{{{ns}}}note[@type="raw_reference"]') or None, + authors=authors, + editors=editors or None, + id=elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id"), + unstructured=elem.findtext(f'.//{{{ns}}}note[@type="raw_reference"]'), # date below - title=elem.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") or None, - journal=elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") or None, - journal_abbrev=None, # XXX - publisher=elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") or None, - volume=elem.findtext(f'.//{{{ns}}}biblScope[@unit="volume"]') or None, - issue=elem.findtext(f'.//{{{ns}}}biblScope[@unit="issue"]') or None, + # titles: @level=a for article, @level=m for monograph (book) + title=elem.findtext(f'.//{{{ns}}}title[@type="main"]'), + journal=elem.findtext(f'.//{{{ns}}}title[@level="j"]'), + journal_abbrev=elem.findtext(f'.//{{{ns}}}title[@level="j"][@type="abbrev"]'), + series_title=elem.findtext(f'.//{{{ns}}}title[@level="s"]'), + publisher=elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher"), + 
institution=elem.findtext(f".//{{{ns}}}respStmt/{{{ns}}}orgName"), + volume=elem.findtext(f'.//{{{ns}}}biblScope[@unit="volume"]'), + issue=elem.findtext(f'.//{{{ns}}}biblScope[@unit="issue"]'), # pages below - # XXX: note - doi=elem.findtext(f'.//{{{ns}}}idno[@type="DOI"]') or None, - pmid=elem.findtext(f'.//{{{ns}}}idno[@type="PMID"]') or None, - pmcid=elem.findtext(f'.//{{{ns}}}idno[@type="PMCID"]') or None, - arxiv_id=elem.findtext(f'.//{{{ns}}}idno[@type="arXiv"]') or None, - issn=elem.findtext(f'.//{{{ns}}}idno[@type="ISSN"]') or None, - eissn=elem.findtext(f'.//{{{ns}}}idno[@type="eISSN"]') or None, + doi=elem.findtext(f'.//{{{ns}}}idno[@type="DOI"]'), + pmid=elem.findtext(f'.//{{{ns}}}idno[@type="PMID"]'), + pmcid=elem.findtext(f'.//{{{ns}}}idno[@type="PMCID"]'), + arxiv_id=elem.findtext(f'.//{{{ns}}}idno[@type="arXiv"]'), + pii=elem.findtext(f'.//{{{ns}}}idno[@type="PII"]'), + ark=elem.findtext(f'.//{{{ns}}}idno[@type="ark"]'), + istex_id=elem.findtext(f'.//{{{ns}}}idno[@type="istexId"]'), + issn=elem.findtext(f'.//{{{ns}}}idno[@type="ISSN"]'), + eissn=elem.findtext(f'.//{{{ns}}}idno[@type="eISSN"]'), ) + book_title_tag = elem.find(f'.//{{{ns}}}title[@level="m"]') + if book_title_tag is not None and book_title_tag.attrib.get('type') is None: + biblio.book_title = book_title_tag.text + if biblio.book_title and not biblio.title: + biblio.title = biblio.book_title + + note_tag = elem.find(f'.//{{{ns}}}note') + if note_tag is not None and note_tag.attrib.get('type') is None: + biblio.note = note_tag.text + if not biblio.publisher: - biblio.publisher = elem.findtext(f".//{{{ns}}}imprint/{{{ns}}}publisher") or None + biblio.publisher = elem.findtext(f".//{{{ns}}}imprint/{{{ns}}}publisher") date_tag = elem.find(f'.//{{{ns}}}date[@type="published"]') if date_tag is not None: biblio.date = date_tag.attrib.get("when") or None - # title stuff is messy in references... 
- if biblio.journal and not biblio.title: - biblio.title = biblio.journal - biblio.journal = None - if biblio.arxiv_id and biblio.arxiv_id.startswith("arXiv:"): biblio.arxiv_id = biblio.arxiv_id[6:] el = elem.find(f'.//{{{ns}}}biblScope[@unit="page"]') if el is not None: + if el.attrib.get("from"): + biblio.first_page = el.attrib["from"] + if el.attrib.get("to"): + biblio.last_page = el.attrib["to"] if el.attrib.get("from") and el.attrib.get("to"): biblio.pages = "{}-{}".format(el.attrib["from"], el.attrib["to"]) else: @@ -205,7 +232,7 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument: grobid_version=application_tag.attrib["version"].strip(), grobid_timestamp=application_tag.attrib["when"].strip(), header=_parse_biblio(header), - pdf_md5=header.findtext(f'.//{{{ns}}}idno[@type="MD5"]') or None, + pdf_md5=header.findtext(f'.//{{{ns}}}idno[@type="MD5"]'), ) refs = [] @@ -217,17 +244,23 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument: text = tei.find(f".//{{{ns}}}text") # print(text.attrib) + if text and text.attrib.get(f"{{{xml_ns}}}lang"): + # this is the 'body' language doc.language_code = text.attrib[f"{{{xml_ns}}}lang"] # xml:lang el = tei.find(f".//{{{ns}}}profileDesc/{{{ns}}}abstract") - doc.abstract = (el or None) and " ".join(el.itertext()).strip() + if el is not None: + doc.abstract = " ".join(el.itertext()).strip() or None el = tei.find(f".//{{{ns}}}text/{{{ns}}}body") - doc.body = (el or None) and " ".join(el.itertext()).strip() + if el is not None: + doc.body = " ".join(el.itertext()).strip() or None el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="acknowledgement"]') - doc.acknowledgement = (el or None) and " ".join(el.itertext()).strip() + if el is not None: + doc.acknowledgement = " ".join(el.itertext()).strip() or None el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="annex"]') - doc.annex = (el or None) and " ".join(el.itertext()).strip() + if el is not None: + doc.annex = " ".join(el.itertext()).strip() or None return 
doc diff --git a/grobid_tei_xml/types.py b/grobid_tei_xml/types.py index 8356c8e..252e677 100644 --- a/grobid_tei_xml/types.py +++ b/grobid_tei_xml/types.py @@ -8,7 +8,6 @@ class GrobidAddress: post_code: Optional[str] = None settlement: Optional[str] = None country: Optional[str] = None - country_code: Optional[str] = None # XXX @dataclass @@ -23,17 +22,16 @@ class GrobidAffiliation: class GrobidAuthor: full_name: Optional[str] given_name: Optional[str] = None - middle: Optional[str] = None # XXX + middle_name: Optional[str] = None surname: Optional[str] = None - suffix: Optional[str] = None # XXX email: Optional[str] = None # XXX + orcid: Optional[str] = None # XXX affiliation: Optional[GrobidAffiliation] = None def to_csl_dict(self) -> dict: d = dict( - given=self.given_name, + given=self.given_name or self.middle_name, family=self.surname, - suffix=self.suffix, ) return _simplify_dict(d) @@ -79,28 +77,64 @@ class GrobidBiblio: date: Optional[str] = None title: Optional[str] = None - journal: Optional[str] = None # XXX: venue? other? 
+ book_title: Optional[str] = None + series_title: Optional[str] = None + editors: Optional[List[GrobidAuthor]] = None + journal: Optional[str] = None journal_abbrev: Optional[str] = None publisher: Optional[str] = None + institution: Optional[str] = None issn: Optional[str] = None eissn: Optional[str] = None volume: Optional[str] = None issue: Optional[str] = None pages: Optional[str] = None - first_page: Optional[str] = None # XXX - last_page: Optional[str] = None # XXX + first_page: Optional[str] = None + last_page: Optional[str] = None note: Optional[str] = None doi: Optional[str] = None pmid: Optional[str] = None pmcid: Optional[str] = None arxiv_id: Optional[str] = None + pii: Optional[str] = None + ark: Optional[str] = None + istex_id: Optional[str] = None url: Optional[str] = None - oa_url: Optional[str] = None def to_dict(self) -> dict: return _simplify_dict(asdict(self)) + def to_legacy_dict(self) -> dict: + """ + Returns a dict in the old "grobid2json" format. + """ + d = self.to_dict() + + # new keys + d.pop('first_page', None) + d.pop('last_page', None) + d.pop('note', None) + + # legacy book title behavior + if not d.get('journal') and d.get('book_title'): + d['journal'] = d.pop('book_title') + else: + d.pop('book_title', None) + + # author changes + for a in d['authors']: + a['name'] = a.pop('full_name', None) + if not a.get('given_name'): + a['given_name'] = a.pop('middle_name', None) + else: + a.pop('middle_name', None) + addr = a.get('affiliation', {}).get('address') + if addr and addr.get('post_code'): + addr['postCode'] = addr.pop('post_code') + + return _simplify_dict(d) + def to_csl_dict(self, default_type: str = "article-journal") -> dict: """ Transforms in to Citation Style Language (CSL) JSON schema, as a dict @@ -119,11 +153,14 @@ class GrobidBiblio: PMCID=self.pmcid, ISSN=self.issn, note=self.note, - # fields with '-' in the key name - **{ - "container-title": self.journal, - "page-first": self.first_page, - }) + ) + # fields with '-' in 
the key name + csl.update({ + "container-title": self.journal, + "book-title": self.book_title, + "series-title": self.series_title, + "page-first": self.first_page, + }) # numeric fields if self.issue and self.issue.isdigit(): @@ -164,32 +201,24 @@ class GrobidDocument: Returns a dict in the old "grobid2json" format. """ d = self.to_dict() + d.pop('header', None) + d.update(self.header.to_legacy_dict()) + if self.citations: + d['citations'] = [c.to_legacy_dict() for c in self.citations] # all header fields at top-level - header = d.pop('header', {}) d['journal'] = dict( - name=header.pop('journal', None), - abbrev=header.pop('journal_abbrev', None), - publisher=header.pop('publisher', None), - issn=header.pop('issn', None), - issne=header.pop('issne', None), + name=d.pop('journal', None), + publisher=d.pop('publisher', None), + issn=d.pop('issn', None), + issne=d.pop('issne', None), + volume=d.pop('volume', None), + issue=d.pop('issue', None), ) - d.update(header) - # files not in the old schema + # document fields not in the old schema d.pop('pdf_md5', None) - for c in d.get('citations', []): - c.pop('note', None) - # author changes - for a in d['authors']: - a['name'] = a.pop('full_name') - addr = a.get('affiliation', {}).get('address') - if addr and addr.get('post_code'): - addr['postCode'] = addr.pop('post_code') - for c in d['citations'] or []: - for a in c['authors']: - a['name'] = a.pop('full_name') return _simplify_dict(d) def remove_encumbered(self) -> None: |