From c6daa0aa2d91666308996c4aab8868389e4aafc6 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 3 Nov 2021 20:05:39 -0700
Subject: improve author and editor parsing

---
Review notes (free text in this position is ignored by git-am):
  * _parse_editor: Element.findall() returns an empty list, never None, so
    the bare-name fallback was unreachable; it now tests for an empty list.
  * _parse_biblio: the contributor[@role="editor"] loop passed the stale
    `elt` variable from the previous loop instead of its own `cet`.
  * Docstring wording fixed in _parse_author and _parse_editor.
  All fixes edit existing added lines in place, so hunk line counts and the
  diffstat below are unchanged.

 grobid_tei_xml/parse.py | 137 ++++++++++++++++++++++++++++++++----------------
 tests/test_parse.py     |   8 ++-
 2 files changed, 99 insertions(+), 46 deletions(-)

diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py
index 4916b7f..dea1f2e 100755
--- a/grobid_tei_xml/parse.py
+++ b/grobid_tei_xml/parse.py
@@ -25,62 +25,112 @@ def _string_to_tree(content: AnyStr) -> ET.ElementTree:
         raise TypeError(f"expected XML as string or bytes, got: {type(content)}")
 
 
-def _parse_author(elem: Optional[ET.Element], ns: str = ns) -> Optional[GrobidAuthor]:
+def _parse_persname(elem: ET.Element, ns: str = ns) -> GrobidAuthor:
     """
-    Internal helper to parse one or more TEI 'author' XML tags into
-    GrobidAuthor objects. 'author' could appear in document headers or
-    citations.
+    Works on a single persName tag and returns a GrobidAuthor object.
+
+    This is used by both the author and editor parsing code, which insert other
+    fields from sibling tags.
     """
     if elem is None:
         return None
-    persname_tag = elem.find(f"./{{{ns}}}persName")
-    if persname_tag is None:
-        # should we do something else here? it is possible to have author
-        # without persName?
-        return None
 
     # basic author name stuff
-    # instead create full_name from all the sub-components of the tag
-    full_name = " ".join([t.strip() for t in persname_tag.itertext() if t.strip()]).strip()
+    # create full_name from *all* sub-component text
+    full_name = " ".join([t.strip() for t in elem.itertext() if t.strip()]).strip()
     ga = GrobidAuthor(
         full_name=full_name or None,
-        given_name=persname_tag.findtext(f'./{{{ns}}}forename[@type="first"]'),
-        middle_name=persname_tag.findtext(f'./{{{ns}}}forename[@type="middle"]'),
-        surname=persname_tag.findtext(f"./{{{ns}}}surname"),
-        email=persname_tag.findtext(f"./{{{ns}}}email"),
-        orcid=elem.findtext(f'.//{{{ns}}}idno[@type="ORCID"]'),
+        given_name=elem.findtext(f'./{{{ns}}}forename[@type="first"]'),
+        middle_name=elem.findtext(f'./{{{ns}}}forename[@type="middle"]'),
+        surname=elem.findtext(f"./{{{ns}}}surname"),
     )
 
+    return ga
+
+
+def _parse_affiliation(elem: ET.Element, ns: str = ns) -> Optional[GrobidAffiliation]:
+
+    affiliation_dict: Dict[str, Any] = dict()
+    for orgname_tag in elem.findall(f"./{{{ns}}}orgName"):
+        orgname_type = orgname_tag.get("type")
+        if orgname_type:
+            affiliation_dict[orgname_type] = orgname_tag.text or None
+
+    if not affiliation_dict:
+        return None
+
+    affiliation = GrobidAffiliation(
+        institution=affiliation_dict.get("institution"),
+        department=affiliation_dict.get("department"),
+        laboratory=affiliation_dict.get("laboratory"),
+    )
+    address_tag = elem.find(f"./{{{ns}}}address")
+    if address_tag is not None:
+        address_dict = dict()
+        for t in list(address_tag):
+            address_dict[t.tag.split("}")[-1]] = t.text or None
+        if address_dict:
+            affiliation.address = GrobidAddress(
+                addr_line=address_dict.get("addrLine"),
+                post_code=address_dict.get("postCode"),
+                settlement=address_dict.get("settlement"),
+                country=address_dict.get("country"),
+            )
+    return affiliation
+
+
+def _parse_author(elem: ET.Element, ns: str = ns) -> Optional[GrobidAuthor]:
+    """
+    Internal helper to parse a single TEI 'author' XML tag into a GrobidAuthor
+    object.
+
+    'author' could appear in document headers or citations.
+    """
+
+    persname_tag = elem.find(f"./{{{ns}}}persName")
+    if persname_tag is None:
+        # should we do something else here? it is possible to have author
+        # without persName? need examples for test coverage
+        return None
+
+    ga = _parse_persname(persname_tag, ns=ns)
+    ga.orcid = elem.findtext(f'.//{{{ns}}}idno[@type="ORCID"]')
+    ga.email = elem.findtext(f"./{{{ns}}}email")
     # author affiliation
     affiliation_tag = elem.find(f"./{{{ns}}}affiliation")
     if affiliation_tag is not None:
-        affiliation_dict: Dict[str, Any] = dict()
-        for orgname_tag in affiliation_tag.findall(f"./{{{ns}}}orgName"):
-            orgname_type = orgname_tag.get("type")
-            if orgname_type:
-                affiliation_dict[orgname_type] = orgname_tag.text or None
-        if affiliation_dict:
-            ga.affiliation = GrobidAffiliation(
-                institution=affiliation_dict.get("institution"),
-                department=affiliation_dict.get("department"),
-                laboratory=affiliation_dict.get("laboratory"),
-            )
-            address_tag = affiliation_tag.find(f"./{{{ns}}}address")
-            if address_tag is not None:
-                address_dict = dict()
-                for t in list(address_tag):
-                    address_dict[t.tag.split("}")[-1]] = t.text or None
-                if address_dict:
-                    ga.affiliation.address = GrobidAddress(
-                        addr_line=address_dict.get("addrLine"),
-                        post_code=address_dict.get("postCode"),
-                        settlement=address_dict.get("settlement"),
-                        country=address_dict.get("country"),
-                    )
+        ga.affiliation = _parse_affiliation(affiliation_tag, ns=ns)
 
     return ga
 
 
+def _parse_editor(elem: ET.Element, ns: str = ns) -> List[GrobidAuthor]:
+    """
+    Unlike 'author', an 'editor' tag sometimes contains multiple persName children.
+
+    Also, sometimes there is no persName, only a bare string under the tag.
+
+    This helper handles all these cases.
+    """
+
+    persname_tags = elem.findall(f"./{{{ns}}}persName")
+    if not persname_tags:  # findall() yields [] (never None) when nothing matches
+        if elem.find("*") is None:
+            # sometimes there is a "bare" editor name we can use
+            raw_name = elem.text
+            if raw_name and len(raw_name.strip()) >= 2:
+                return [GrobidAuthor(full_name=raw_name.strip())]
+        return []
+
+    persons = []
+    for tag in persname_tags:
+        ga = _parse_persname(tag, ns=ns)
+        # AFAIK editors don't have affiliation; need test coverage if they do
+        if ga:
+            persons.append(ga)
+    return persons
+
+
 def _clean_url(url: Optional[str]) -> Optional[str]:
     if not url:
         return None
@@ -139,12 +189,11 @@ def _parse_biblio(elem: ET.Element, ns: str = ns) -> GrobidBiblio:
 
     editors = []
     editor_tags = elem.findall(f".//{{{ns}}}editor")
-    if not editor_tags:
-        editor_tags = elem.findall(f'.//{{{ns}}}contributor[@role="editor"]')
     for elt in editor_tags or []:
-        e = _parse_author(elt, ns=ns)
-        if e is not None:
-            editors.append(e)
+        editors.extend(_parse_editor(elt, ns=ns))
+    contrib_editor_tags = elem.findall(f'.//{{{ns}}}contributor[@role="editor"]')
+    for cet in contrib_editor_tags or []:
+        editors.extend(_parse_editor(cet, ns=ns))
 
     biblio = GrobidBiblio(
         authors=authors,
diff --git a/tests/test_parse.py b/tests/test_parse.py
index 32d7ea9..203c960 100644
--- a/tests/test_parse.py
+++ b/tests/test_parse.py
@@ -387,8 +387,11 @@ def test_citation_list_utf8() -> None:
     assert ref_str.first_page == "155"
     assert ref_str.pages == "155-172"
 
+
 def test_citation_multiple_editors() -> None:
-    with open("tests/files/citation/single_editor_multiple_persname.grobid070.tei.xml", "r") as f:
+    with open(
+        "tests/files/citation/single_editor_multiple_persname.grobid070.tei.xml", "r"
+    ) as f:
         tei_xml = f.read()
 
     ref = parse_citation_xml(tei_xml)
@@ -398,11 +401,12 @@ def test_citation_multiple_editors() -> None:
     assert ref.authors[0].full_name == "J R Lurain"
     assert ref.authors[0].middle_name == "R"
     assert ref.authors[0].surname == "Lurain"
+    assert ref.editors
     assert len(ref.editors) == 3
     assert ref.editors[0].full_name == "J S Berek"
     assert ref.editors[1].full_name == "E Y Adashi"
     assert ref.editors[2].full_name == "P A Hillard"
-    assert ref.journal == "Novak’s gynecology"
+    assert ref.book_title == "Novak’s gynecology"
     assert ref.publisher == "Williams and Wilkins"
     assert ref.date == "1996"
     assert ref.note == "12th ed. Baltimore"
-- 
cgit v1.2.3