diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-22 13:35:19 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-22 13:35:19 -0700 |
commit | 8cd413e2ad07bae6bf3ae940d7c4b94b4be274fa (patch) | |
tree | 12483a2692eb20e8fe69b6137788d5d20c781852 /grobid_tei_xml/parse.py | |
parent | 5bce48eb6e09decd6cbf20850b3ff674dbcedba9 (diff) | |
download | grobid_tei_xml-8cd413e2ad07bae6bf3ae940d7c4b94b4be274fa.tar.gz grobid_tei_xml-8cd413e2ad07bae6bf3ae940d7c4b94b4be274fa.zip |
bunch of lint and fmt cleanups
Diffstat (limited to 'grobid_tei_xml/parse.py')
-rwxr-xr-x | grobid_tei_xml/parse.py | 57 |
1 files changed, 23 insertions, 34 deletions
diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py index bbe383f..029fa85 100755 --- a/grobid_tei_xml/parse.py +++ b/grobid_tei_xml/parse.py @@ -1,30 +1,28 @@ import io -import json import xml.etree.ElementTree as ET from typing import Any, AnyStr, Dict, List, Optional -from .types import * +from .types import (GrobidAddress, GrobidAffiliation, GrobidAuthor, GrobidCitation, + GrobidDocument, GrobidHeader, GrobidJournal) xml_ns = "http://www.w3.org/XML/1998/namespace" ns = "http://www.tei-c.org/ns/1.0" -def _string_to_tree(content: AnyStr) -> ET: +def _string_to_tree(content: AnyStr) -> ET.ElementTree: if isinstance(content, str): return ET.parse(io.StringIO(content)) elif isinstance(content, bytes): return ET.parse(io.BytesIO(content)) if isinstance(content, io.StringIO) or isinstance(content, io.BytesIO): return ET.parse(content) - elif isinstance(content, ET): + elif isinstance(content, ET.ElementTree): return content else: - raise TypeError( - f"expected XML as string or bytes, got: {type(content)}") + raise TypeError(f"expected XML as string or bytes, got: {type(content)}") -def _parse_authors(elem: Optional[ET.Element], - ns: str = ns) -> List[GrobidAffiliation]: +def _parse_authors(elem: Optional[ET.Element], ns: str = ns) -> List[GrobidAuthor]: if not elem: return [] names = [] @@ -34,8 +32,7 @@ def _parse_authors(elem: Optional[ET.Element], continue given_name = pn.findtext(f"./{{{ns}}}forename") or None surname = pn.findtext(f"./{{{ns}}}surname") or None - full_name = " ".join([t.strip() for t in pn.itertext() - if t.strip()]).strip() + full_name = " ".join([t.strip() for t in pn.itertext() if t.strip()]).strip() obj: Dict[str, Any] = dict(name=full_name) if given_name: obj["given_name"] = given_name @@ -51,7 +48,7 @@ def _parse_authors(elem: Optional[ET.Element], addr_e = ae.find(f"./{{{ns}}}address") if addr_e: address = dict() - for t in addr_e.getchildren(): + for t in list(addr_e): address[t.tag.split("}")[-1]] = t.text if address: address['post_code'] = address.pop('postCode', None) @@ -70,8 +67,7 @@ def _parse_authors(elem: Optional[ET.Element], def _parse_citation(elem: ET.Element, ns: str = ns) -> GrobidCitation: ref: Dict[str, Any] = dict() ref["id"] = elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id") - ref["unstructured"] = elem.findtext('.//{%s}note[@type="raw_reference"]' % - ns) + ref["unstructured"] = elem.findtext('.//{%s}note[@type="raw_reference"]' % ns) # Title stuff is messy in references... ref["title"] = elem.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") other_title = elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") @@ -82,11 +78,9 @@ def _parse_citation(elem: ET.Element, ns: str = ns) -> GrobidCitation: ref["journal"] = None ref["title"] = other_title ref["authors"] = _parse_authors(elem, ns=ns) - ref["publisher"] = elem.findtext( - f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") + ref["publisher"] = elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") if not ref["publisher"]: - ref["publisher"] = elem.findtext( - f".//{{{ns}}}imprint/{{{ns}}}publisher") + ref["publisher"] = elem.findtext(f".//{{{ns}}}imprint/{{{ns}}}publisher") if ref["publisher"] == "": ref["publisher"] = None date = elem.find('.//{%s}date[@type="published"]' % ns) @@ -124,8 +118,7 @@ def _parse_citation(elem: ET.Element, ns: str = ns) -> GrobidCitation: def _parse_journal(elem: ET.Element, ns: str = ns) -> GrobidJournal: journal = dict() journal["name"] = elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") - journal["publisher"] = elem.findtext( - f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") + journal["publisher"] = elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") if journal["publisher"] == "": journal["publisher"] = None journal["issn"] = elem.findtext('.//{%s}idno[@type="ISSN"]' % ns) @@ -138,16 +131,13 @@ def _parse_journal(elem: ET.Element, ns: str = ns) -> GrobidJournal: def _parse_header(elem: ET.Element, ns: str = ns) -> GrobidHeader: header = elem - info = dict() + info: Dict[str, Any] = dict() info["title"] = header.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") - info["authors"] = _parse_authors( - header.find(f".//{{{ns}}}sourceDesc/{{{ns}}}biblStruct")) + info["authors"] = _parse_authors(header.find(f".//{{{ns}}}sourceDesc/{{{ns}}}biblStruct")) info["journal"] = _parse_journal(header) date = header.find(f'.//{{{ns}}}date[@type="published"]') info["date"] = (date is not None) and date.attrib.get("when") info["doi"] = header.findtext(f'.//{{{ns}}}idno[@type="DOI"]') - if info["doi"]: - info["doi"] = info["doi"].lower() return GrobidHeader(**info) @@ -155,19 +145,17 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument: """ Use this function to parse TEI-XML of a full document or header processed by GROBID. - + Eg, the output of '/api/processFulltextDocument' or '/api/processHeader' """ tree = _string_to_tree(xml_text) tei = tree.getroot() - info = dict() header = tei.find(f".//{{{ns}}}teiHeader") if header is None: raise ValueError("XML does not look like TEI format") - application_tag = header.findall( - f".//{{{ns}}}appInfo/{{{ns}}}application")[0] + application_tag = header.findall(f".//{{{ns}}}appInfo/{{{ns}}}application")[0] doc = GrobidDocument( grobid_version=application_tag.attrib["version"].strip(), @@ -177,14 +165,12 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument: ) refs = [] - for (i, bs) in enumerate( - tei.findall(f".//{{{ns}}}listBibl/{{{ns}}}biblStruct")): + for (i, bs) in enumerate(tei.findall(f".//{{{ns}}}listBibl/{{{ns}}}biblStruct")): ref = _parse_citation(bs) ref.index = i refs.append(ref) doc.citations = refs - text = tei.find(f".//{{{ns}}}text") # print(text.attrib) if text and text.attrib.get(f"{{{xml_ns}}}lang"): @@ -205,11 +191,14 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument: def parse_citations_xml(xml_text: AnyStr) -> List[GrobidCitation]: """ Use this function to parse TEI-XML of one or more references. - + Eg, the output of '/api/processReferences' or '/api/processCitation'. """ # XXX: this replacement shouldn't be needed? - xml_text = xml_text.replace('xmlns="http://www.tei-c.org/ns/1.0"', "") + if isinstance(xml_text, bytes): + xml_text = xml_text.replace(b'xmlns="http://www.tei-c.org/ns/1.0"', b"") + elif isinstance(xml_text, str): + xml_text = xml_text.replace('xmlns="http://www.tei-c.org/ns/1.0"', "") tree = _string_to_tree(xml_text) root = tree.getroot() @@ -219,7 +208,7 @@ def parse_citations_xml(xml_text: AnyStr) -> List[GrobidCitation]: return [ref] refs = [] - for (i, bs) in enumerate(tree.findall(f".//biblStruct")): + for (i, bs) in enumerate(tree.findall(".//biblStruct")): ref = _parse_citation(bs, ns='') ref.index = i refs.append(ref) |