diff options
Diffstat (limited to 'grobid_tei_xml/grobid2json.py')
-rw-r--r-- | grobid_tei_xml/grobid2json.py | 38 |
1 files changed, 12 insertions, 26 deletions
diff --git a/grobid_tei_xml/grobid2json.py b/grobid_tei_xml/grobid2json.py index 3c56b19..7f455af 100644 --- a/grobid_tei_xml/grobid2json.py +++ b/grobid_tei_xml/grobid2json.py @@ -26,18 +26,14 @@ This file copied from the sandcrawler repository. """ import io -import json import xml.etree.ElementTree as ET from typing import Any, AnyStr, Dict, List, Optional -from .types import * - xml_ns = "http://www.w3.org/XML/1998/namespace" ns = "http://www.tei-c.org/ns/1.0" -def all_authors(elem: Optional[ET.Element], - ns: str = ns) -> List[Dict[str, Any]]: +def all_authors(elem: Optional[ET.Element], ns: str = ns) -> List[Dict[str, Any]]: if not elem: return [] names = [] @@ -47,8 +43,7 @@ def all_authors(elem: Optional[ET.Element], continue given_name = pn.findtext("./{%s}forename" % ns) or None surname = pn.findtext("./{%s}surname" % ns) or None - full_name = " ".join([t.strip() for t in pn.itertext() - if t.strip()]).strip() + full_name = " ".join([t.strip() for t in pn.itertext() if t.strip()]).strip() obj: Dict[str, Any] = dict(name=full_name) if given_name: obj["given_name"] = given_name @@ -64,7 +59,7 @@ def all_authors(elem: Optional[ET.Element], addr_e = ae.find("./{%s}address" % ns) if addr_e: address = dict() - for t in addr_e.getchildren(): + for t in list(addr_e): address[t.tag.split("}")[-1]] = t.text if address: affiliation["address"] = address @@ -81,8 +76,7 @@ def all_authors(elem: Optional[ET.Element], def journal_info(elem: ET.Element) -> Dict[str, Any]: journal = dict() journal["name"] = elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") - journal["publisher"] = elem.findtext( - f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") + journal["publisher"] = elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") if journal["publisher"] == "": journal["publisher"] = None journal["issn"] = elem.findtext('.//{%s}idno[@type="ISSN"]' % ns) @@ -101,8 +95,7 @@ def journal_info(elem: ET.Element) -> Dict[str, Any]: def biblio_info(elem: ET.Element, ns: str = ns) -> Dict[str, Any]: ref: Dict[str, Any] = dict() ref["id"] = elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id") - ref["unstructured"] = elem.findtext('.//{%s}note[@type="raw_reference"]' % - ns) + ref["unstructured"] = elem.findtext('.//{%s}note[@type="raw_reference"]' % ns) # Title stuff is messy in references... ref["title"] = elem.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") other_title = elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") @@ -113,11 +106,9 @@ def biblio_info(elem: ET.Element, ns: str = ns) -> Dict[str, Any]: ref["journal"] = None ref["title"] = other_title ref["authors"] = all_authors(elem, ns=ns) - ref["publisher"] = elem.findtext( - f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") + ref["publisher"] = elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") if not ref["publisher"]: - ref["publisher"] = elem.findtext( - f".//{{{ns}}}imprint/{{{ns}}}publisher") + ref["publisher"] = elem.findtext(f".//{{{ns}}}imprint/{{{ns}}}publisher") if ref["publisher"] == "": ref["publisher"] = None date = elem.find('.//{%s}date[@type="published"]' % ns) @@ -168,25 +159,21 @@ def teixml2json(content: AnyStr, encumbered: bool = True) -> Dict[str, Any]: header = tei.find(".//{%s}teiHeader" % ns) if header is None: raise ValueError("XML does not look like TEI format") - application_tag = header.findall( - f".//{{{ns}}}appInfo/{{{ns}}}application")[0] + application_tag = header.findall(f".//{{{ns}}}appInfo/{{{ns}}}application")[0] info["grobid_version"] = application_tag.attrib["version"].strip() info["grobid_timestamp"] = application_tag.attrib["when"].strip() info["title"] = header.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") - info["authors"] = all_authors( - header.find(f".//{{{ns}}}sourceDesc/{{{ns}}}biblStruct")) + info["authors"] = all_authors(header.find(f".//{{{ns}}}sourceDesc/{{{ns}}}biblStruct")) info["journal"] = journal_info(header) date = header.find('.//{%s}date[@type="published"]' % ns) info["date"] = (date is not None) and date.attrib.get("when") - info["fatcat_release"] = header.findtext('.//{%s}idno[@type="fatcat"]' % - ns) + info["fatcat_release"] = header.findtext('.//{%s}idno[@type="fatcat"]' % ns) info["doi"] = header.findtext('.//{%s}idno[@type="DOI"]' % ns) if info["doi"]: info["doi"] = info["doi"].lower() refs = [] - for (i, bs) in enumerate( - tei.findall(f".//{{{ns}}}listBibl/{{{ns}}}biblStruct")): + for (i, bs) in enumerate(tei.findall(f".//{{{ns}}}listBibl/{{{ns}}}biblStruct")): ref = biblio_info(bs) ref["index"] = i refs.append(ref) @@ -203,8 +190,7 @@ def teixml2json(content: AnyStr, encumbered: bool = True) -> Dict[str, Any]: el = tei.find(f".//{{{ns}}}text/{{{ns}}}body") info["body"] = (el or None) and " ".join(el.itertext()).strip() el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="acknowledgement"]') - info["acknowledgement"] = (el or None) and " ".join( - el.itertext()).strip() + info["acknowledgement"] = (el or None) and " ".join(el.itertext()).strip() el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="annex"]') info["annex"] = (el or None) and " ".join(el.itertext()).strip() |