improve author and editor parsing

author: Bryan Newbold <bnewbold@archive.org> 2021-11-03 20:05:39 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-11-03 20:05:39 -0700
commit: c6daa0aa2d91666308996c4aab8868389e4aafc6 (patch)
tree: c28d32da157760ead18718bbf7f8c18337b363a0
parent: 8c7fcc943bc217bf456362b9e205910623974fd0 (diff)
download: grobid_tei_xml-c6daa0aa2d91666308996c4aab8868389e4aafc6.tar.gz
grobid_tei_xml-c6daa0aa2d91666308996c4aab8868389e4aafc6.zip
2 files changed, 99 insertions, 46 deletions
diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py
index 4916b7f..dea1f2e 100755
--- a/grobid_tei_xml/parse.py
+++ b/grobid_tei_xml/parse.py
@@ -25,62 +25,112 @@ def _string_to_tree(content: AnyStr) -> ET.ElementTree:
         raise TypeError(f"expected XML as string or bytes, got: {type(content)}")
 
 
-def _parse_author(elem: Optional[ET.Element], ns: str = ns) -> Optional[GrobidAuthor]:
+def _parse_persname(elem: ET.Element, ns: str = ns) -> GrobidAuthor:
     """
-    Internal helper to parse one or more TEI 'author' XML tags into
-    GrobidAuthor objects. 'author' could appear in document headers or
-    citations.
+    Works on a single persName tag and returns a GrobidAuthor object.
+
+    This is used by both the author and editor parsing code, which insert other
+    fields from sibling tags.
     """
 
     if elem is None:
         return None
-    persname_tag = elem.find(f"./{{{ns}}}persName")
-    if persname_tag is None:
-        # should we do something else here? it is possible to have author
-        # without persName?
-        return None
 
     # basic author name stuff
-    # instead create full_name from all the sub-components of the tag
-    full_name = " ".join([t.strip() for t in persname_tag.itertext() if t.strip()]).strip()
+    # create full_name from *all* sub-component text
+    full_name = " ".join([t.strip() for t in elem.itertext() if t.strip()]).strip()
     ga = GrobidAuthor(
         full_name=full_name or None,
-        given_name=persname_tag.findtext(f'./{{{ns}}}forename[@type="first"]'),
-        middle_name=persname_tag.findtext(f'./{{{ns}}}forename[@type="middle"]'),
-        surname=persname_tag.findtext(f"./{{{ns}}}surname"),
-        email=persname_tag.findtext(f"./{{{ns}}}email"),
-        orcid=elem.findtext(f'.//{{{ns}}}idno[@type="ORCID"]'),
+        given_name=elem.findtext(f'./{{{ns}}}forename[@type="first"]'),
+        middle_name=elem.findtext(f'./{{{ns}}}forename[@type="middle"]'),
+        surname=elem.findtext(f"./{{{ns}}}surname"),
     )
+    return ga
+
+
+def _parse_affiliation(elem: ET.Element, ns: str = ns) -> Optional[GrobidAffiliation]:
+    """Parse an 'affiliation' tag into a GrobidAffiliation; None if no typed orgName."""
+    affiliation_dict: Dict[str, Any] = dict()
+    for orgname_tag in elem.findall(f"./{{{ns}}}orgName"):
+        orgname_type = orgname_tag.get("type")
+        if orgname_type:
+            affiliation_dict[orgname_type] = orgname_tag.text or None
+
+    if not affiliation_dict:  # no orgName with a non-empty "type"; address is skipped too
+        return None
+
+    affiliation = GrobidAffiliation(
+        institution=affiliation_dict.get("institution"),
+        department=affiliation_dict.get("department"),
+        laboratory=affiliation_dict.get("laboratory"),
+    )
+    address_tag = elem.find(f"./{{{ns}}}address")
+    if address_tag is not None:
+        address_dict = dict()
+        for t in list(address_tag):
+            address_dict[t.tag.split("}")[-1]] = t.text or None  # key on tag name without "{ns}"
+        if address_dict:
+            affiliation.address = GrobidAddress(
+                addr_line=address_dict.get("addrLine"),
+                post_code=address_dict.get("postCode"),
+                settlement=address_dict.get("settlement"),
+                country=address_dict.get("country"),
+            )
+    return affiliation
+
+
+def _parse_author(elem: ET.Element, ns: str = ns) -> Optional[GrobidAuthor]:
+    """
+    Internal helper to parse a single TEI 'author' XML tag into a GrobidAuthor
+    object.
+
+    'author' could appear in document headers or citations.
+    """
+
+    persname_tag = elem.find(f"./{{{ns}}}persName")
+    if persname_tag is None:
+        # should we do something else here? it is possible to have author
+        # without persName? need examples for test coverage
+        return None
+
+    ga = _parse_persname(persname_tag, ns=ns)
+    ga.orcid = elem.findtext(f'.//{{{ns}}}idno[@type="ORCID"]')
+    ga.email = elem.findtext(f"./{{{ns}}}email")
 
     # author affiliation
     affiliation_tag = elem.find(f"./{{{ns}}}affiliation")
     if affiliation_tag is not None:
-        affiliation_dict: Dict[str, Any] = dict()
-        for orgname_tag in affiliation_tag.findall(f"./{{{ns}}}orgName"):
-            orgname_type = orgname_tag.get("type")
-            if orgname_type:
-                affiliation_dict[orgname_type] = orgname_tag.text or None
-        if affiliation_dict:
-            ga.affiliation = GrobidAffiliation(
-                institution=affiliation_dict.get("institution"),
-                department=affiliation_dict.get("department"),
-                laboratory=affiliation_dict.get("laboratory"),
-            )
-            address_tag = affiliation_tag.find(f"./{{{ns}}}address")
-            if address_tag is not None:
-                address_dict = dict()
-                for t in list(address_tag):
-                    address_dict[t.tag.split("}")[-1]] = t.text or None
-                if address_dict:
-                    ga.affiliation.address = GrobidAddress(
-                        addr_line=address_dict.get("addrLine"),
-                        post_code=address_dict.get("postCode"),
-                        settlement=address_dict.get("settlement"),
-                        country=address_dict.get("country"),
-                    )
+        ga.affiliation = _parse_affiliation(affiliation_tag, ns=ns)
     return ga
 
 
+def _parse_editor(elem: ET.Element, ns: str = ns) -> List[GrobidAuthor]:
+    """
+    Unlike <author>, <editor> sometimes contains multiple persName in the single <editor> tag.
+
+    Also, sometimes there is no persName, only a bare string under the <editor> tag.
+
+    Returns one GrobidAuthor per editor found; the list is empty if none could be parsed.
+    """
+    # findall() returns a (possibly empty) list, never None, so test for emptiness
+    persname_tags = elem.findall(f"./{{{ns}}}persName")
+    if not persname_tags:
+        if elem.find("*") is None:
+            # sometimes there is a "bare" editor name we can use
+            raw_name = elem.text
+            if raw_name and len(raw_name.strip()) >= 2:
+                return [GrobidAuthor(full_name=raw_name.strip())]
+        return []
+
+    persons = []
+    for tag in persname_tags:
+        ga = _parse_persname(tag, ns=ns)
+        # AFAIK editors don't have affiliation; need test coverage if they do
+        if ga:
+            persons.append(ga)
+    return persons
+
+
 def _clean_url(url: Optional[str]) -> Optional[str]:
     if not url:
         return None
@@ -139,12 +189,11 @@ def _parse_biblio(elem: ET.Element, ns: str = ns) -> GrobidBiblio:
 
     editors = []
     editor_tags = elem.findall(f".//{{{ns}}}editor")
-    if not editor_tags:
-        editor_tags = elem.findall(f'.//{{{ns}}}contributor[@role="editor"]')
     for elt in editor_tags or []:
-        e = _parse_author(elt, ns=ns)
-        if e is not None:
-            editors.append(e)
+        editors.extend(_parse_editor(elt, ns=ns))
+    contrib_editor_tags = elem.findall(f'.//{{{ns}}}contributor[@role="editor"]')
+    for cet in contrib_editor_tags or []:
+        editors.extend(_parse_editor(cet, ns=ns))
 
     biblio = GrobidBiblio(
         authors=authors,
diff --git a/tests/test_parse.py b/tests/test_parse.py
index 32d7ea9..203c960 100644
--- a/tests/test_parse.py
+++ b/tests/test_parse.py
@@ -387,8 +387,11 @@ def test_citation_list_utf8() -> None:
     assert ref_str.first_page == "155"
     assert ref_str.pages == "155-172"
 
+
 def test_citation_multiple_editors() -> None:
-    with open("tests/files/citation/single_editor_multiple_persname.grobid070.tei.xml", "r") as f:
+    with open(
+        "tests/files/citation/single_editor_multiple_persname.grobid070.tei.xml", "r"
+    ) as f:
         tei_xml = f.read()
 
     ref = parse_citation_xml(tei_xml)
@@ -398,11 +401,12 @@ def test_citation_multiple_editors() -> None:
     assert ref.authors[0].full_name == "J R Lurain"
     assert ref.authors[0].middle_name == "R"
     assert ref.authors[0].surname == "Lurain"
+    assert ref.editors
     assert len(ref.editors) == 3
     assert ref.editors[0].full_name == "J S Berek"
     assert ref.editors[1].full_name == "E Y Adashi"
     assert ref.editors[2].full_name == "P A Hillard"
-    assert ref.journal == "Novak’s gynecology"
+    assert ref.book_title == "Novak’s gynecology"
     assert ref.publisher == "Williams and Wilkins"
     assert ref.date == "1996"
     assert ref.note == "12th ed. Baltimore"
author	Bryan Newbold <bnewbold@archive.org>	2021-11-03 20:05:39 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2021-11-03 20:05:39 -0700
commit	c6daa0aa2d91666308996c4aab8868389e4aafc6 (patch)
tree	c28d32da157760ead18718bbf7f8c18337b363a0
parent	8c7fcc943bc217bf456362b9e205910623974fd0 (diff)
download	grobid_tei_xml-c6daa0aa2d91666308996c4aab8868389e4aafc6.tar.gz grobid_tei_xml-c6daa0aa2d91666308996c4aab8868389e4aafc6.zip