summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2021-11-03 20:05:39 -0700
committerBryan Newbold <bnewbold@archive.org>2021-11-03 20:05:39 -0700
commitc6daa0aa2d91666308996c4aab8868389e4aafc6 (patch)
treec28d32da157760ead18718bbf7f8c18337b363a0
parent8c7fcc943bc217bf456362b9e205910623974fd0 (diff)
downloadgrobid_tei_xml-c6daa0aa2d91666308996c4aab8868389e4aafc6.tar.gz
grobid_tei_xml-c6daa0aa2d91666308996c4aab8868389e4aafc6.zip
improve author and editor parsing
-rwxr-xr-xgrobid_tei_xml/parse.py137
-rw-r--r--tests/test_parse.py8
2 files changed, 99 insertions, 46 deletions
diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py
index 4916b7f..dea1f2e 100755
--- a/grobid_tei_xml/parse.py
+++ b/grobid_tei_xml/parse.py
@@ -25,62 +25,112 @@ def _string_to_tree(content: AnyStr) -> ET.ElementTree:
raise TypeError(f"expected XML as string or bytes, got: {type(content)}")
-def _parse_author(elem: Optional[ET.Element], ns: str = ns) -> Optional[GrobidAuthor]:
+def _parse_persname(elem: ET.Element, ns: str = ns) -> GrobidAuthor:
"""
- Internal helper to parse one or more TEI 'author' XML tags into
- GrobidAuthor objects. 'author' could appear in document headers or
- citations.
+ Works on a single persName tag and returns a GrobidAuthor object.
+
+ This is used by both the author and editor parsing code, which insert other
+ fields from sibling tags.
+
+ full_name is every non-empty text fragment found under the tag, stripped
+ and joined with single spaces (None if nothing is left). given_name,
+ middle_name and surname are read from the direct forename[@type="first"],
+ forename[@type="middle"] and surname children respectively.
+
+ NOTE: despite the annotations, a None elem is tolerated and yields None.
"""
if elem is None:
return None
- persname_tag = elem.find(f"./{{{ns}}}persName")
- if persname_tag is None:
- # should we do something else here? it is possible to have author
- # without persName?
- return None
# basic author name stuff
- # instead create full_name from all the sub-components of the tag
- full_name = " ".join([t.strip() for t in persname_tag.itertext() if t.strip()]).strip()
+ # create full_name from *all* sub-component text
+ full_name = " ".join([t.strip() for t in elem.itertext() if t.strip()]).strip()
ga = GrobidAuthor(
full_name=full_name or None,
- given_name=persname_tag.findtext(f'./{{{ns}}}forename[@type="first"]'),
- middle_name=persname_tag.findtext(f'./{{{ns}}}forename[@type="middle"]'),
- surname=persname_tag.findtext(f"./{{{ns}}}surname"),
- email=persname_tag.findtext(f"./{{{ns}}}email"),
- orcid=elem.findtext(f'.//{{{ns}}}idno[@type="ORCID"]'),
+ given_name=elem.findtext(f'./{{{ns}}}forename[@type="first"]'),
+ middle_name=elem.findtext(f'./{{{ns}}}forename[@type="middle"]'),
+ surname=elem.findtext(f"./{{{ns}}}surname"),
)
+ return ga
+
+
+def _parse_affiliation(elem: ET.Element, ns: str = ns) -> Optional[GrobidAffiliation]:
+    """
+    Build a GrobidAffiliation from a single TEI 'affiliation' tag.
+
+    Direct orgName children are keyed by their "type" attribute; untyped ones
+    are skipped and a repeated type keeps the last value seen. Returns None
+    when no typed orgName is present, even if the tag carries an address.
+    """
+    org_names: Dict[str, Any] = {
+        org.get("type"): org.text or None
+        for org in elem.findall(f"./{{{ns}}}orgName")
+        if org.get("type")
+    }
+    if not org_names:
+        return None
+
+    result = GrobidAffiliation(
+        institution=org_names.get("institution"),
+        department=org_names.get("department"),
+        laboratory=org_names.get("laboratory"),
+    )
+
+    address_elem = elem.find(f"./{{{ns}}}address")
+    if address_elem is None:
+        return result
+
+    # index the address children by tag name with the namespace prefix removed
+    parts = {child.tag.split("}")[-1]: child.text or None for child in address_elem}
+    if parts:
+        result.address = GrobidAddress(
+            addr_line=parts.get("addrLine"),
+            post_code=parts.get("postCode"),
+            settlement=parts.get("settlement"),
+            country=parts.get("country"),
+        )
+    return result
+
+
+def _parse_author(elem: ET.Element, ns: str = ns) -> Optional[GrobidAuthor]:
+ """
+ Internal helper to parse a single TEI 'author' XML tag into a GrobidAuthor
+ object.
+
+ 'author' could appear in document headers or citations.
+
+ Returns None when the tag has no direct persName child. Otherwise the name
+ fields come from _parse_persname(), orcid from any descendant
+ idno[@type="ORCID"], email from a direct email child, and affiliation from
+ the first direct affiliation child (via _parse_affiliation()).
+ """
+
+ persname_tag = elem.find(f"./{{{ns}}}persName")
+ if persname_tag is None:
+ # should we do something else here? it is possible to have author
+ # without persName? need examples for test coverage
+ return None
+
+ ga = _parse_persname(persname_tag, ns=ns)
+ ga.orcid = elem.findtext(f'.//{{{ns}}}idno[@type="ORCID"]')
+ ga.email = elem.findtext(f"./{{{ns}}}email")
# author affiliation
affiliation_tag = elem.find(f"./{{{ns}}}affiliation")
if affiliation_tag is not None:
- affiliation_dict: Dict[str, Any] = dict()
- for orgname_tag in affiliation_tag.findall(f"./{{{ns}}}orgName"):
- orgname_type = orgname_tag.get("type")
- if orgname_type:
- affiliation_dict[orgname_type] = orgname_tag.text or None
- if affiliation_dict:
- ga.affiliation = GrobidAffiliation(
- institution=affiliation_dict.get("institution"),
- department=affiliation_dict.get("department"),
- laboratory=affiliation_dict.get("laboratory"),
- )
- address_tag = affiliation_tag.find(f"./{{{ns}}}address")
- if address_tag is not None:
- address_dict = dict()
- for t in list(address_tag):
- address_dict[t.tag.split("}")[-1]] = t.text or None
- if address_dict:
- ga.affiliation.address = GrobidAddress(
- addr_line=address_dict.get("addrLine"),
- post_code=address_dict.get("postCode"),
- settlement=address_dict.get("settlement"),
- country=address_dict.get("country"),
- )
+ ga.affiliation = _parse_affiliation(affiliation_tag, ns=ns)
return ga
+def _parse_editor(elem: ET.Element, ns: str = ns) -> List[GrobidAuthor]:
+    """
+    Parse a single TEI 'editor' tag into a list of GrobidAuthor objects.
+
+    Unlike <author>, <editor> sometimes contains multiple persName in the
+    single <editor> tag; each one becomes its own GrobidAuthor.
+
+    Also, sometimes there is no persName, only a bare string under the
+    <editor> tag. That string is used as full_name when the tag has no child
+    elements and the stripped text is at least two characters long.
+
+    Returns an empty list when neither form yields a name.
+    """
+
+    persname_tags = elem.findall(f"./{{{ns}}}persName")
+    # findall() returns an empty list (never None) when nothing matches, so
+    # test for emptiness; an `is None` check would make the bare-name fallback
+    # below unreachable
+    if not persname_tags:
+        if elem.find("*") is None:
+            # sometimes there is a "bare" editor name we can use
+            raw_name = elem.text
+            if raw_name and len(raw_name.strip()) >= 2:
+                return [GrobidAuthor(full_name=raw_name.strip())]
+        return []
+
+    persons = []
+    for tag in persname_tags:
+        ga = _parse_persname(tag, ns=ns)
+        # AFAIK editors don't have affiliation; need test coverage if they do
+        if ga:
+            persons.append(ga)
+    return persons
+
+
def _clean_url(url: Optional[str]) -> Optional[str]:
if not url:
return None
@@ -139,12 +189,11 @@ def _parse_biblio(elem: ET.Element, ns: str = ns) -> GrobidBiblio:
editors = []
editor_tags = elem.findall(f".//{{{ns}}}editor")
- if not editor_tags:
- editor_tags = elem.findall(f'.//{{{ns}}}contributor[@role="editor"]')
for elt in editor_tags or []:
- e = _parse_author(elt, ns=ns)
- if e is not None:
- editors.append(e)
+ editors.extend(_parse_editor(elt, ns=ns))
+    # <contributor role="editor"> tags are parsed the same way as <editor>;
+    # pass this loop's own variable, not the stale `elt` from the loop above
+    contrib_editor_tags = elem.findall(f'.//{{{ns}}}contributor[@role="editor"]')
+    for cet in contrib_editor_tags or []:
+        editors.extend(_parse_editor(cet, ns=ns))
biblio = GrobidBiblio(
authors=authors,
diff --git a/tests/test_parse.py b/tests/test_parse.py
index 32d7ea9..203c960 100644
--- a/tests/test_parse.py
+++ b/tests/test_parse.py
@@ -387,8 +387,11 @@ def test_citation_list_utf8() -> None:
assert ref_str.first_page == "155"
assert ref_str.pages == "155-172"
+
def test_citation_multiple_editors() -> None:
- with open("tests/files/citation/single_editor_multiple_persname.grobid070.tei.xml", "r") as f:
+ with open(
+ "tests/files/citation/single_editor_multiple_persname.grobid070.tei.xml", "r"
+ ) as f:
tei_xml = f.read()
ref = parse_citation_xml(tei_xml)
@@ -398,11 +401,12 @@ def test_citation_multiple_editors() -> None:
assert ref.authors[0].full_name == "J R Lurain"
assert ref.authors[0].middle_name == "R"
assert ref.authors[0].surname == "Lurain"
+ assert ref.editors
assert len(ref.editors) == 3
assert ref.editors[0].full_name == "J S Berek"
assert ref.editors[1].full_name == "E Y Adashi"
assert ref.editors[2].full_name == "P A Hillard"
- assert ref.journal == "Novak’s gynecology"
+ assert ref.book_title == "Novak’s gynecology"
assert ref.publisher == "Williams and Wilkins"
assert ref.date == "1996"
assert ref.note == "12th ed. Baltimore"