diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-22 13:35:19 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-22 13:35:19 -0700 |
commit | 8cd413e2ad07bae6bf3ae940d7c4b94b4be274fa (patch) | |
tree | 12483a2692eb20e8fe69b6137788d5d20c781852 /grobid_tei_xml | |
parent | 5bce48eb6e09decd6cbf20850b3ff674dbcedba9 (diff) | |
download | grobid_tei_xml-8cd413e2ad07bae6bf3ae940d7c4b94b4be274fa.tar.gz grobid_tei_xml-8cd413e2ad07bae6bf3ae940d7c4b94b4be274fa.zip |
bunch of lint and fmt cleanups
Diffstat (limited to 'grobid_tei_xml')
-rw-r--r-- | grobid_tei_xml/__main__.py | 8 | ||||
-rw-r--r-- | grobid_tei_xml/grobid2json.py | 38 | ||||
-rw-r--r-- | grobid_tei_xml/grobid_unstructured.py | 16 | ||||
-rwxr-xr-x | grobid_tei_xml/parse.py | 57 | ||||
-rw-r--r-- | grobid_tei_xml/types.py | 14 |
5 files changed, 45 insertions, 88 deletions
diff --git a/grobid_tei_xml/__main__.py b/grobid_tei_xml/__main__.py index 2d10e84..cc240f4 100644 --- a/grobid_tei_xml/__main__.py +++ b/grobid_tei_xml/__main__.py @@ -13,8 +13,7 @@ def main() -> None: # pragma no cover parser.add_argument( "--no-encumbered", action="store_true", - help= - "don't include ambiguously copyright encumbered fields (eg, abstract, body)", + help="don't include ambiguously copyright encumbered fields (eg, abstract, body)", ) parser.add_argument("teifiles", nargs="+") @@ -24,10 +23,7 @@ def main() -> None: # pragma no cover content = open(filename, "r").read() doc = parse_document_xml(content) if args.no_encumbered: - doc.body = None - doc.annex = None - doc.acknowledgements = None - doc.abstract = None + doc.remove_encumbered() print(json.dumps(doc.to_dict(), sort_keys=True)) diff --git a/grobid_tei_xml/grobid2json.py b/grobid_tei_xml/grobid2json.py index 3c56b19..7f455af 100644 --- a/grobid_tei_xml/grobid2json.py +++ b/grobid_tei_xml/grobid2json.py @@ -26,18 +26,14 @@ This file copied from the sandcrawler repository. """ import io -import json import xml.etree.ElementTree as ET from typing import Any, AnyStr, Dict, List, Optional -from .types import * - xml_ns = "http://www.w3.org/XML/1998/namespace" ns = "http://www.tei-c.org/ns/1.0" -def all_authors(elem: Optional[ET.Element], - ns: str = ns) -> List[Dict[str, Any]]: +def all_authors(elem: Optional[ET.Element], ns: str = ns) -> List[Dict[str, Any]]: if not elem: return [] names = [] @@ -47,8 +43,7 @@ def all_authors(elem: Optional[ET.Element], continue given_name = pn.findtext("./{%s}forename" % ns) or None surname = pn.findtext("./{%s}surname" % ns) or None - full_name = " ".join([t.strip() for t in pn.itertext() - if t.strip()]).strip() + full_name = " ".join([t.strip() for t in pn.itertext() if t.strip()]).strip() obj: Dict[str, Any] = dict(name=full_name) if given_name: obj["given_name"] = given_name @@ -64,7 +59,7 @@ def all_authors(elem: Optional[ET.Element], addr_e = ae.find("./{%s}address" % ns) if addr_e: address = dict() - for t in addr_e.getchildren(): + for t in list(addr_e): address[t.tag.split("}")[-1]] = t.text if address: affiliation["address"] = address @@ -81,8 +76,7 @@ def all_authors(elem: Optional[ET.Element], def journal_info(elem: ET.Element) -> Dict[str, Any]: journal = dict() journal["name"] = elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") - journal["publisher"] = elem.findtext( - f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") + journal["publisher"] = elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") if journal["publisher"] == "": journal["publisher"] = None journal["issn"] = elem.findtext('.//{%s}idno[@type="ISSN"]' % ns) @@ -101,8 +95,7 @@ def journal_info(elem: ET.Element) -> Dict[str, Any]: def biblio_info(elem: ET.Element, ns: str = ns) -> Dict[str, Any]: ref: Dict[str, Any] = dict() ref["id"] = elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id") - ref["unstructured"] = elem.findtext('.//{%s}note[@type="raw_reference"]' % - ns) + ref["unstructured"] = elem.findtext('.//{%s}note[@type="raw_reference"]' % ns) # Title stuff is messy in references... ref["title"] = elem.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") other_title = elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") @@ -113,11 +106,9 @@ def biblio_info(elem: ET.Element, ns: str = ns) -> Dict[str, Any]: ref["journal"] = None ref["title"] = other_title ref["authors"] = all_authors(elem, ns=ns) - ref["publisher"] = elem.findtext( - f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") + ref["publisher"] = elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") if not ref["publisher"]: - ref["publisher"] = elem.findtext( - f".//{{{ns}}}imprint/{{{ns}}}publisher") + ref["publisher"] = elem.findtext(f".//{{{ns}}}imprint/{{{ns}}}publisher") if ref["publisher"] == "": ref["publisher"] = None date = elem.find('.//{%s}date[@type="published"]' % ns) @@ -168,25 +159,21 @@ def teixml2json(content: AnyStr, encumbered: bool = True) -> Dict[str, Any]: header = tei.find(".//{%s}teiHeader" % ns) if header is None: raise ValueError("XML does not look like TEI format") - application_tag = header.findall( - f".//{{{ns}}}appInfo/{{{ns}}}application")[0] + application_tag = header.findall(f".//{{{ns}}}appInfo/{{{ns}}}application")[0] info["grobid_version"] = application_tag.attrib["version"].strip() info["grobid_timestamp"] = application_tag.attrib["when"].strip() info["title"] = header.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") - info["authors"] = all_authors( - header.find(f".//{{{ns}}}sourceDesc/{{{ns}}}biblStruct")) + info["authors"] = all_authors(header.find(f".//{{{ns}}}sourceDesc/{{{ns}}}biblStruct")) info["journal"] = journal_info(header) date = header.find('.//{%s}date[@type="published"]' % ns) info["date"] = (date is not None) and date.attrib.get("when") - info["fatcat_release"] = header.findtext('.//{%s}idno[@type="fatcat"]' % - ns) + info["fatcat_release"] = header.findtext('.//{%s}idno[@type="fatcat"]' % ns) info["doi"] = header.findtext('.//{%s}idno[@type="DOI"]' % ns) if info["doi"]: info["doi"] = info["doi"].lower() refs = [] - for (i, bs) in enumerate( - tei.findall(f".//{{{ns}}}listBibl/{{{ns}}}biblStruct")): + for (i, bs) in enumerate(tei.findall(f".//{{{ns}}}listBibl/{{{ns}}}biblStruct")): ref = biblio_info(bs) ref["index"] = i refs.append(ref) @@ -203,8 +190,7 @@ def teixml2json(content: AnyStr, encumbered: bool = True) -> Dict[str, Any]: el = tei.find(f".//{{{ns}}}text/{{{ns}}}body") info["body"] = (el or None) and " ".join(el.itertext()).strip() el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="acknowledgement"]') - info["acknowledgement"] = (el or None) and " ".join( - el.itertext()).strip() + info["acknowledgement"] = (el or None) and " ".join(el.itertext()).strip() el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="annex"]') info["annex"] = (el or None) and " ".join(el.itertext()).strip() diff --git a/grobid_tei_xml/grobid_unstructured.py b/grobid_tei_xml/grobid_unstructured.py deleted file mode 100644 index cbf7322..0000000 --- a/grobid_tei_xml/grobid_unstructured.py +++ /dev/null @@ -1,16 +0,0 @@ -""" -Helper functions to parse an unstructured citation string using GROBID, then -fuzzy match using the result. - -- try to parse string with GROBID REST API call -- transform the GROBID XML response to a simple dict/struct - -TODO: more general versions which handle multiple reference strings in a batch? -""" - -import io -import sys -import xml.etree.ElementTree as ET -from typing import Optional - -from .parse import biblio_info diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py index bbe383f..029fa85 100755 --- a/grobid_tei_xml/parse.py +++ b/grobid_tei_xml/parse.py @@ -1,30 +1,28 @@ import io -import json import xml.etree.ElementTree as ET from typing import Any, AnyStr, Dict, List, Optional -from .types import * +from .types import (GrobidAddress, GrobidAffiliation, GrobidAuthor, GrobidCitation, + GrobidDocument, GrobidHeader, GrobidJournal) xml_ns = "http://www.w3.org/XML/1998/namespace" ns = "http://www.tei-c.org/ns/1.0" -def _string_to_tree(content: AnyStr) -> ET: +def _string_to_tree(content: AnyStr) -> ET.ElementTree: if isinstance(content, str): return ET.parse(io.StringIO(content)) elif isinstance(content, bytes): return ET.parse(io.BytesIO(content)) if isinstance(content, io.StringIO) or isinstance(content, io.BytesIO): return ET.parse(content) - elif isinstance(content, ET): + elif isinstance(content, ET.ElementTree): return content else: - raise TypeError( - f"expected XML as string or bytes, got: {type(content)}") + raise TypeError(f"expected XML as string or bytes, got: {type(content)}") -def _parse_authors(elem: Optional[ET.Element], - ns: str = ns) -> List[GrobidAffiliation]: +def _parse_authors(elem: Optional[ET.Element], ns: str = ns) -> List[GrobidAuthor]: if not elem: return [] names = [] @@ -34,8 +32,7 @@ def _parse_authors(elem: Optional[ET.Element], continue given_name = pn.findtext(f"./{{{ns}}}forename") or None surname = pn.findtext(f"./{{{ns}}}surname") or None - full_name = " ".join([t.strip() for t in pn.itertext() - if t.strip()]).strip() + full_name = " ".join([t.strip() for t in pn.itertext() if t.strip()]).strip() obj: Dict[str, Any] = dict(name=full_name) if given_name: obj["given_name"] = given_name @@ -51,7 +48,7 @@ def _parse_authors(elem: Optional[ET.Element], addr_e = ae.find(f"./{{{ns}}}address") if addr_e: address = dict() - for t in addr_e.getchildren(): + for t in list(addr_e): address[t.tag.split("}")[-1]] = t.text if address: address['post_code'] = address.pop('postCode', None) @@ -70,8 +67,7 @@ def _parse_authors(elem: Optional[ET.Element], def _parse_citation(elem: ET.Element, ns: str = ns) -> GrobidCitation: ref: Dict[str, Any] = dict() ref["id"] = elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id") - ref["unstructured"] = elem.findtext('.//{%s}note[@type="raw_reference"]' % - ns) + ref["unstructured"] = elem.findtext('.//{%s}note[@type="raw_reference"]' % ns) # Title stuff is messy in references... ref["title"] = elem.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") other_title = elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") @@ -82,11 +78,9 @@ def _parse_citation(elem: ET.Element, ns: str = ns) -> GrobidCitation: ref["journal"] = None ref["title"] = other_title ref["authors"] = _parse_authors(elem, ns=ns) - ref["publisher"] = elem.findtext( - f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") + ref["publisher"] = elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") if not ref["publisher"]: - ref["publisher"] = elem.findtext( - f".//{{{ns}}}imprint/{{{ns}}}publisher") + ref["publisher"] = elem.findtext(f".//{{{ns}}}imprint/{{{ns}}}publisher") if ref["publisher"] == "": ref["publisher"] = None date = elem.find('.//{%s}date[@type="published"]' % ns) @@ -124,8 +118,7 @@ def _parse_citation(elem: ET.Element, ns: str = ns) -> GrobidCitation: def _parse_journal(elem: ET.Element, ns: str = ns) -> GrobidJournal: journal = dict() journal["name"] = elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") - journal["publisher"] = elem.findtext( - f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") + journal["publisher"] = elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") if journal["publisher"] == "": journal["publisher"] = None journal["issn"] = elem.findtext('.//{%s}idno[@type="ISSN"]' % ns) @@ -138,16 +131,13 @@ def _parse_journal(elem: ET.Element, ns: str = ns) -> GrobidJournal: def _parse_header(elem: ET.Element, ns: str = ns) -> GrobidHeader: header = elem - info = dict() + info: Dict[str, Any] = dict() info["title"] = header.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") - info["authors"] = _parse_authors( - header.find(f".//{{{ns}}}sourceDesc/{{{ns}}}biblStruct")) + info["authors"] = _parse_authors(header.find(f".//{{{ns}}}sourceDesc/{{{ns}}}biblStruct")) info["journal"] = _parse_journal(header) date = header.find(f'.//{{{ns}}}date[@type="published"]') info["date"] = (date is not None) and date.attrib.get("when") info["doi"] = header.findtext(f'.//{{{ns}}}idno[@type="DOI"]') - if info["doi"]: - info["doi"] = info["doi"].lower() return GrobidHeader(**info) @@ -155,19 +145,17 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument: """ Use this function to parse TEI-XML of a full document or header processed by GROBID. - + Eg, the output of '/api/processFulltextDocument' or '/api/processHeader' """ tree = _string_to_tree(xml_text) tei = tree.getroot() - info = dict() header = tei.find(f".//{{{ns}}}teiHeader") if header is None: raise ValueError("XML does not look like TEI format") - application_tag = header.findall( - f".//{{{ns}}}appInfo/{{{ns}}}application")[0] + application_tag = header.findall(f".//{{{ns}}}appInfo/{{{ns}}}application")[0] doc = GrobidDocument( grobid_version=application_tag.attrib["version"].strip(), @@ -177,14 +165,12 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument: ) refs = [] - for (i, bs) in enumerate( - tei.findall(f".//{{{ns}}}listBibl/{{{ns}}}biblStruct")): + for (i, bs) in enumerate(tei.findall(f".//{{{ns}}}listBibl/{{{ns}}}biblStruct")): ref = _parse_citation(bs) ref.index = i refs.append(ref) doc.citations = refs - text = tei.find(f".//{{{ns}}}text") # print(text.attrib) if text and text.attrib.get(f"{{{xml_ns}}}lang"): @@ -205,11 +191,14 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument: def parse_citations_xml(xml_text: AnyStr) -> List[GrobidCitation]: """ Use this function to parse TEI-XML of one or more references. - + Eg, the output of '/api/processReferences' or '/api/processCitation'. """ # XXX: this replacement shouldn't be needed? - xml_text = xml_text.replace('xmlns="http://www.tei-c.org/ns/1.0"', "") + if isinstance(xml_text, bytes): + xml_text = xml_text.replace(b'xmlns="http://www.tei-c.org/ns/1.0"', b"") + elif isinstance(xml_text, str): + xml_text = xml_text.replace('xmlns="http://www.tei-c.org/ns/1.0"', "") tree = _string_to_tree(xml_text) root = tree.getroot() @@ -219,7 +208,7 @@ def parse_citations_xml(xml_text: AnyStr) -> List[GrobidCitation]: return [ref] refs = [] - for (i, bs) in enumerate(tree.findall(f".//biblStruct")): + for (i, bs) in enumerate(tree.findall(".//biblStruct")): ref = _parse_citation(bs, ns='') ref.index = i refs.append(ref) diff --git a/grobid_tei_xml/types.py b/grobid_tei_xml/types.py index e6718c1..b86e1a4 100644 --- a/grobid_tei_xml/types.py +++ b/grobid_tei_xml/types.py @@ -1,5 +1,5 @@ from dataclasses import asdict, dataclass -from typing import Any, AnyStr, Dict, List, Optional +from typing import List, Optional @dataclass @@ -25,7 +25,7 @@ class GrobidAuthor: # TODO: 'forename'? given_name: Optional[str] = None surname: Optional[str] = None - affiliation: Optional[dict] = None + affiliation: Optional[GrobidAffiliation] = None @dataclass @@ -68,11 +68,11 @@ class GrobidJournal: @dataclass class GrobidHeader: + authors: List[GrobidAuthor] title: Optional[str] = None - authors: Optional[str] = None date: Optional[str] = None doi: Optional[str] = None - #TODO: note: Optional[str] + # TODO: note: Optional[str] journal: Optional[GrobidJournal] = None @@ -80,7 +80,7 @@ class GrobidHeader: class GrobidDocument: grobid_version: str grobid_timestamp: str - #TODO: pdf_md5: Optional[str] + # TODO: pdf_md5: Optional[str] header: GrobidHeader citations: Optional[List[GrobidCitation]] = None language_code: Optional[str] = None @@ -115,9 +115,11 @@ def _simplify_dict(d: dict) -> dict: """ Recursively remove empty dict values from a dict and all sub-lists and sub-dicts. + + TODO: should this return Optional[dict]? """ if d in [None, {}, '']: - return None + return {} for k in list(d.keys()): if isinstance(d[k], dict): d[k] = _simplify_dict(d[k]) |