diff options
-rwxr-xr-x | grobid_tei_xml/grobid2json.py | 44 | ||||
-rw-r--r-- | grobid_tei_xml/grobid_unstructured.py | 1 | ||||
-rw-r--r-- | tests/test_grobid2json.py | 20 | ||||
-rw-r--r-- | tests/test_grobid_unstructured.py | 5 |
4 files changed, 39 insertions, 31 deletions
diff --git a/grobid_tei_xml/grobid2json.py b/grobid_tei_xml/grobid2json.py index ac0710c..5edee36 100755 --- a/grobid_tei_xml/grobid2json.py +++ b/grobid_tei_xml/grobid2json.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 - """ NB: adapted to work as a library for PDF extraction. Will probably be re-written eventually to be correct, complete, and robust; this is just a @@ -25,17 +24,18 @@ Prints JSON to stdout, errors to stderr This file copied from the sandcrawler repository. """ +import argparse import io import json -import argparse import xml.etree.ElementTree as ET -from typing import List, Any, Dict, AnyStr, Optional +from typing import Any, AnyStr, Dict, List, Optional xml_ns = "http://www.w3.org/XML/1998/namespace" ns = "http://www.tei-c.org/ns/1.0" -def all_authors(elem: Optional[ET.Element], ns: str = ns) -> List[Dict[str, Any]]: +def all_authors(elem: Optional[ET.Element], + ns: str = ns) -> List[Dict[str, Any]]: if not elem: return [] names = [] @@ -45,7 +45,8 @@ def all_authors(elem: Optional[ET.Element], ns: str = ns) -> List[Dict[str, Any] continue given_name = pn.findtext("./{%s}forename" % ns) or None surname = pn.findtext("./{%s}surname" % ns) or None - full_name = " ".join([t.strip() for t in pn.itertext() if t.strip()]).strip() + full_name = " ".join([t.strip() for t in pn.itertext() + if t.strip()]).strip() obj: Dict[str, Any] = dict(name=full_name) if given_name: obj["given_name"] = given_name @@ -79,8 +80,7 @@ def journal_info(elem: ET.Element) -> Dict[str, Any]: journal = dict() journal["name"] = elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") journal["publisher"] = elem.findtext( - f".//{{{ns}}}publicationStmt/{{{ns}}}publisher" - ) + f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") if journal["publisher"] == "": journal["publisher"] = None journal["issn"] = elem.findtext('.//{%s}idno[@type="ISSN"]' % ns) @@ -99,7 +99,8 @@ def journal_info(elem: ET.Element) -> Dict[str, Any]: def biblio_info(elem: ET.Element, ns: str = ns) -> Dict[str, Any]: ref: Dict[str, Any] = dict() ref["id"] = elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id") - ref["unstructured"] = elem.findtext('.//{%s}note[@type="raw_reference"]' % ns) + ref["unstructured"] = elem.findtext('.//{%s}note[@type="raw_reference"]' % + ns) # Title stuff is messy in references... ref["title"] = elem.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") other_title = elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") @@ -110,9 +111,11 @@ def biblio_info(elem: ET.Element, ns: str = ns) -> Dict[str, Any]: ref["journal"] = None ref["title"] = other_title ref["authors"] = all_authors(elem, ns=ns) - ref["publisher"] = elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") + ref["publisher"] = elem.findtext( + f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") if not ref["publisher"]: - ref["publisher"] = elem.findtext(f".//{{{ns}}}imprint/{{{ns}}}publisher") + ref["publisher"] = elem.findtext( + f".//{{{ns}}}imprint/{{{ns}}}publisher") if ref["publisher"] == "": ref["publisher"] = None date = elem.find('.//{%s}date[@type="published"]' % ns) @@ -162,23 +165,25 @@ def teixml2json(content: AnyStr, encumbered: bool = True) -> Dict[str, Any]: header = tei.find(".//{%s}teiHeader" % ns) if header is None: raise ValueError("XML does not look like TEI format") - application_tag = header.findall(f".//{{{ns}}}appInfo/{{{ns}}}application")[0] + application_tag = header.findall( + f".//{{{ns}}}appInfo/{{{ns}}}application")[0] info["grobid_version"] = application_tag.attrib["version"].strip() info["grobid_timestamp"] = application_tag.attrib["when"].strip() info["title"] = header.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") info["authors"] = all_authors( - header.find(f".//{{{ns}}}sourceDesc/{{{ns}}}biblStruct") - ) + header.find(f".//{{{ns}}}sourceDesc/{{{ns}}}biblStruct")) info["journal"] = journal_info(header) date = header.find('.//{%s}date[@type="published"]' % ns) info["date"] = (date is not None) and date.attrib.get("when") - info["fatcat_release"] = header.findtext('.//{%s}idno[@type="fatcat"]' % ns) + info["fatcat_release"] = header.findtext('.//{%s}idno[@type="fatcat"]' % + ns) info["doi"] = header.findtext('.//{%s}idno[@type="DOI"]' % ns) if info["doi"]: info["doi"] = info["doi"].lower() refs = [] - for (i, bs) in enumerate(tei.findall(f".//{{{ns}}}listBibl/{{{ns}}}biblStruct")): + for (i, bs) in enumerate( + tei.findall(f".//{{{ns}}}listBibl/{{{ns}}}biblStruct")): ref = biblio_info(bs) ref["index"] = i refs.append(ref) @@ -195,7 +200,8 @@ def teixml2json(content: AnyStr, encumbered: bool = True) -> Dict[str, Any]: el = tei.find(f".//{{{ns}}}text/{{{ns}}}body") info["body"] = (el or None) and " ".join(el.itertext()).strip() el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="acknowledgement"]') - info["acknowledgement"] = (el or None) and " ".join(el.itertext()).strip() + info["acknowledgement"] = (el or None) and " ".join( + el.itertext()).strip() el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="annex"]') info["annex"] = (el or None) and " ".join(el.itertext()).strip() @@ -216,7 +222,8 @@ def main() -> None: # pragma no cover parser.add_argument( "--no-encumbered", action="store_true", - help="don't include ambiguously copyright encumbered fields (eg, abstract, body)", + help= + "don't include ambiguously copyright encumbered fields (eg, abstract, body)", ) parser.add_argument("teifiles", nargs="+") @@ -228,8 +235,7 @@ def main() -> None: # pragma no cover json.dumps( teixml2json(content, encumbered=(not args.no_encumbered)), sort_keys=True, - ) - ) + )) if __name__ == "__main__": # pragma no cover diff --git a/grobid_tei_xml/grobid_unstructured.py b/grobid_tei_xml/grobid_unstructured.py index 83808e0..eeb3507 100644 --- a/grobid_tei_xml/grobid_unstructured.py +++ b/grobid_tei_xml/grobid_unstructured.py @@ -29,4 +29,3 @@ def transform_grobid_ref_xml(raw_xml: str) -> Optional[dict]: if not any(ref.values()): return None return ref - diff --git a/tests/test_grobid2json.py b/tests/test_grobid2json.py index 7ee423c..6e3dac2 100644 --- a/tests/test_grobid2json.py +++ b/tests/test_grobid2json.py @@ -1,4 +1,3 @@ - import xml import json import pytest @@ -6,14 +5,15 @@ from grobid_tei_xml.grobid2json import teixml2json def test_small_xml(): - + with open('tests/files/small.xml', 'r') as f: tei_xml = f.read() with open('tests/files/small.json', 'r') as f: - json_form = json.loads(f.read()) + json_form = json.loads(f.read()) assert teixml2json(tei_xml) == json_form + def test_invalid_xml(): with pytest.raises(xml.etree.ElementTree.ParseError): @@ -30,18 +30,22 @@ def test_grobid_teixml2json() -> None: obj = teixml2json(blob, True) assert ( - obj["title"] - == "Changes of patients' satisfaction with the health care services in Lithuanian Health Promoting Hospitals network" + obj["title"] == + "Changes of patients' satisfaction with the health care services in Lithuanian Health Promoting Hospitals network" ) ref = [c for c in obj["citations"] if c["id"] == "b12"][0] - assert ref["authors"][0] == {"given_name": "K", "name": "K Tasa", "surname": "Tasa"} + assert ref["authors"][0] == { + "given_name": "K", + "name": "K Tasa", + "surname": "Tasa" + } assert ref["journal"] == "Quality Management in Health Care" assert ref["title"] == "Using patient feedback for quality improvement" assert ref["date"] == "1996" assert ref["pages"] == "206-225" assert ref["volume"] == "8" assert ( - ref["unstructured"] - == "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19." + ref["unstructured"] == + "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19." ) diff --git a/tests/test_grobid_unstructured.py b/tests/test_grobid_unstructured.py index b8d79ca..b203b30 100644 --- a/tests/test_grobid_unstructured.py +++ b/tests/test_grobid_unstructured.py @@ -54,7 +54,8 @@ def test_transform_grobid_ref_xml(): </biblStruct>""" d = transform_grobid_ref_xml(citation_xml) - assert d['title'] == "Mesh migration following abdominal hernia repair: a comprehensive review" + assert d[ + 'title'] == "Mesh migration following abdominal hernia repair: a comprehensive review" assert d['authors'][2]['given_name'] == "L" assert d['authors'][2]['surname'] == "Taveras" assert d['authors'][2]['name'] == "L R Taveras" @@ -65,5 +66,3 @@ def test_transform_grobid_ref_xml(): assert d['volume'] == "23" assert d['issue'] == "2" assert d['journal'] == "Hernia" - - |