diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-22 13:35:19 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-22 13:35:19 -0700 |
commit | 8cd413e2ad07bae6bf3ae940d7c4b94b4be274fa (patch) | |
tree | 12483a2692eb20e8fe69b6137788d5d20c781852 | |
parent | 5bce48eb6e09decd6cbf20850b3ff674dbcedba9 (diff) | |
download | grobid_tei_xml-8cd413e2ad07bae6bf3ae940d7c4b94b4be274fa.tar.gz grobid_tei_xml-8cd413e2ad07bae6bf3ae940d7c4b94b4be274fa.zip |
bunch of lint and fmt cleanups
-rw-r--r-- | Makefile | 12 | ||||
-rw-r--r-- | grobid_tei_xml/__main__.py | 8 | ||||
-rw-r--r-- | grobid_tei_xml/grobid2json.py | 38 | ||||
-rw-r--r-- | grobid_tei_xml/grobid_unstructured.py | 16 | ||||
-rwxr-xr-x | grobid_tei_xml/parse.py | 57 | ||||
-rw-r--r-- | grobid_tei_xml/types.py | 14 | ||||
-rw-r--r-- | pyproject.toml | 7 | ||||
-rw-r--r-- | tests/test_grobid2json.py | 35 | ||||
-rw-r--r-- | tests/test_parse.py | 99 |
9 files changed, 125 insertions, 161 deletions
@@ -15,8 +15,10 @@ dep: ## Install dependencies using pipenv .PHONY: lint lint: ## Run lints (eg, flake8, mypy) - pipenv run flake8 grobid_tei_xml/ tests/ --exit-zero - pipenv run mypy grobid_tei_xml/ tests/ --ignore-missing-imports + @pipenv run flake8 grobid_tei_xml/ --exit-zero --max-line-length 96 --ignore ANN101 --per-file-ignores="grobid_tei_xml/__init__.py:F401" + @pipenv run flake8 tests/ --ignore F405,F403,E501 --exit-zero --max-line-length 96 + @pipenv run isort -q -c . + @pipenv run mypy grobid_tei_xml/ tests/ --ignore-missing-imports .PHONY: fmt fmt: ## Run code formating on all source code @@ -25,12 +27,12 @@ fmt: ## Run code formating on all source code .PHONY: test test: ## Run all tests and lints - pipenv run python -m pytest -vv + @pipenv run python -m pytest -vv .PHONY: test-readme test-readme: ## Test codeblocks in the README (includes live web requests) - pipenv run python -m pytest --codeblocks + @pipenv run python -m pytest --codeblocks .PHONY: coverage coverage: ## Run all tests with coverage - pipenv run python -m pytest --cov --cov-report=term --cov-report=html + @pipenv run python -m pytest --cov --cov-report=term --cov-report=html diff --git a/grobid_tei_xml/__main__.py b/grobid_tei_xml/__main__.py index 2d10e84..cc240f4 100644 --- a/grobid_tei_xml/__main__.py +++ b/grobid_tei_xml/__main__.py @@ -13,8 +13,7 @@ def main() -> None: # pragma no cover parser.add_argument( "--no-encumbered", action="store_true", - help= - "don't include ambiguously copyright encumbered fields (eg, abstract, body)", + help="don't include ambiguously copyright encumbered fields (eg, abstract, body)", ) parser.add_argument("teifiles", nargs="+") @@ -24,10 +23,7 @@ def main() -> None: # pragma no cover content = open(filename, "r").read() doc = parse_document_xml(content) if args.no_encumbered: - doc.body = None - doc.annex = None - doc.acknowledgements = None - doc.abstract = None + doc.remove_encumbered() print(json.dumps(doc.to_dict(), sort_keys=True)) diff --git a/grobid_tei_xml/grobid2json.py b/grobid_tei_xml/grobid2json.py index 3c56b19..7f455af 100644 --- a/grobid_tei_xml/grobid2json.py +++ b/grobid_tei_xml/grobid2json.py @@ -26,18 +26,14 @@ This file copied from the sandcrawler repository. """ import io -import json import xml.etree.ElementTree as ET from typing import Any, AnyStr, Dict, List, Optional -from .types import * - xml_ns = "http://www.w3.org/XML/1998/namespace" ns = "http://www.tei-c.org/ns/1.0" -def all_authors(elem: Optional[ET.Element], - ns: str = ns) -> List[Dict[str, Any]]: +def all_authors(elem: Optional[ET.Element], ns: str = ns) -> List[Dict[str, Any]]: if not elem: return [] names = [] @@ -47,8 +43,7 @@ def all_authors(elem: Optional[ET.Element], continue given_name = pn.findtext("./{%s}forename" % ns) or None surname = pn.findtext("./{%s}surname" % ns) or None - full_name = " ".join([t.strip() for t in pn.itertext() - if t.strip()]).strip() + full_name = " ".join([t.strip() for t in pn.itertext() if t.strip()]).strip() obj: Dict[str, Any] = dict(name=full_name) if given_name: obj["given_name"] = given_name @@ -64,7 +59,7 @@ def all_authors(elem: Optional[ET.Element], addr_e = ae.find("./{%s}address" % ns) if addr_e: address = dict() - for t in addr_e.getchildren(): + for t in list(addr_e): address[t.tag.split("}")[-1]] = t.text if address: affiliation["address"] = address @@ -81,8 +76,7 @@ def all_authors(elem: Optional[ET.Element], def journal_info(elem: ET.Element) -> Dict[str, Any]: journal = dict() journal["name"] = elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") - journal["publisher"] = elem.findtext( - f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") + journal["publisher"] = elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") if journal["publisher"] == "": journal["publisher"] = None journal["issn"] = elem.findtext('.//{%s}idno[@type="ISSN"]' % ns) @@ -101,8 +95,7 @@ def journal_info(elem: ET.Element) -> Dict[str, Any]: def biblio_info(elem: ET.Element, ns: str = ns) -> Dict[str, Any]: ref: Dict[str, Any] = dict() ref["id"] = elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id") - ref["unstructured"] = elem.findtext('.//{%s}note[@type="raw_reference"]' % - ns) + ref["unstructured"] = elem.findtext('.//{%s}note[@type="raw_reference"]' % ns) # Title stuff is messy in references... ref["title"] = elem.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") other_title = elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") @@ -113,11 +106,9 @@ def biblio_info(elem: ET.Element, ns: str = ns) -> Dict[str, Any]: ref["journal"] = None ref["title"] = other_title ref["authors"] = all_authors(elem, ns=ns) - ref["publisher"] = elem.findtext( - f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") + ref["publisher"] = elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") if not ref["publisher"]: - ref["publisher"] = elem.findtext( - f".//{{{ns}}}imprint/{{{ns}}}publisher") + ref["publisher"] = elem.findtext(f".//{{{ns}}}imprint/{{{ns}}}publisher") if ref["publisher"] == "": ref["publisher"] = None date = elem.find('.//{%s}date[@type="published"]' % ns) @@ -168,25 +159,21 @@ def teixml2json(content: AnyStr, encumbered: bool = True) -> Dict[str, Any]: header = tei.find(".//{%s}teiHeader" % ns) if header is None: raise ValueError("XML does not look like TEI format") - application_tag = header.findall( - f".//{{{ns}}}appInfo/{{{ns}}}application")[0] + application_tag = header.findall(f".//{{{ns}}}appInfo/{{{ns}}}application")[0] info["grobid_version"] = application_tag.attrib["version"].strip() info["grobid_timestamp"] = application_tag.attrib["when"].strip() info["title"] = header.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") - info["authors"] = all_authors( - header.find(f".//{{{ns}}}sourceDesc/{{{ns}}}biblStruct")) + info["authors"] = all_authors(header.find(f".//{{{ns}}}sourceDesc/{{{ns}}}biblStruct")) info["journal"] = journal_info(header) date = header.find('.//{%s}date[@type="published"]' % ns) info["date"] = (date is not None) and date.attrib.get("when") - info["fatcat_release"] = header.findtext('.//{%s}idno[@type="fatcat"]' % - ns) + info["fatcat_release"] = header.findtext('.//{%s}idno[@type="fatcat"]' % ns) info["doi"] = header.findtext('.//{%s}idno[@type="DOI"]' % ns) if info["doi"]: info["doi"] = info["doi"].lower() refs = [] - for (i, bs) in enumerate( - tei.findall(f".//{{{ns}}}listBibl/{{{ns}}}biblStruct")): + for (i, bs) in enumerate(tei.findall(f".//{{{ns}}}listBibl/{{{ns}}}biblStruct")): ref = biblio_info(bs) ref["index"] = i refs.append(ref) @@ -203,8 +190,7 @@ def teixml2json(content: AnyStr, encumbered: bool = True) -> Dict[str, Any]: el = tei.find(f".//{{{ns}}}text/{{{ns}}}body") info["body"] = (el or None) and " ".join(el.itertext()).strip() el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="acknowledgement"]') - info["acknowledgement"] = (el or None) and " ".join( - el.itertext()).strip() + info["acknowledgement"] = (el or None) and " ".join(el.itertext()).strip() el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="annex"]') info["annex"] = (el or None) and " ".join(el.itertext()).strip() diff --git a/grobid_tei_xml/grobid_unstructured.py b/grobid_tei_xml/grobid_unstructured.py deleted file mode 100644 index cbf7322..0000000 --- a/grobid_tei_xml/grobid_unstructured.py +++ /dev/null @@ -1,16 +0,0 @@ -""" -Helper functions to parse an unstructured citation string using GROBID, then -fuzzy match using the result. - -- try to parse string with GROBID REST API call -- transform the GROBID XML response to a simple dict/struct - -TODO: more general versions which handle multiple reference strings in a batch? -""" - -import io -import sys -import xml.etree.ElementTree as ET -from typing import Optional - -from .parse import biblio_info diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py index bbe383f..029fa85 100755 --- a/grobid_tei_xml/parse.py +++ b/grobid_tei_xml/parse.py @@ -1,30 +1,28 @@ import io -import json import xml.etree.ElementTree as ET from typing import Any, AnyStr, Dict, List, Optional -from .types import * +from .types import (GrobidAddress, GrobidAffiliation, GrobidAuthor, GrobidCitation, + GrobidDocument, GrobidHeader, GrobidJournal) xml_ns = "http://www.w3.org/XML/1998/namespace" ns = "http://www.tei-c.org/ns/1.0" -def _string_to_tree(content: AnyStr) -> ET: +def _string_to_tree(content: AnyStr) -> ET.ElementTree: if isinstance(content, str): return ET.parse(io.StringIO(content)) elif isinstance(content, bytes): return ET.parse(io.BytesIO(content)) if isinstance(content, io.StringIO) or isinstance(content, io.BytesIO): return ET.parse(content) - elif isinstance(content, ET): + elif isinstance(content, ET.ElementTree): return content else: - raise TypeError( - f"expected XML as string or bytes, got: {type(content)}") + raise TypeError(f"expected XML as string or bytes, got: {type(content)}") -def _parse_authors(elem: Optional[ET.Element], - ns: str = ns) -> List[GrobidAffiliation]: +def _parse_authors(elem: Optional[ET.Element], ns: str = ns) -> List[GrobidAuthor]: if not elem: return [] names = [] @@ -34,8 +32,7 @@ def _parse_authors(elem: Optional[ET.Element], continue given_name = pn.findtext(f"./{{{ns}}}forename") or None surname = pn.findtext(f"./{{{ns}}}surname") or None - full_name = " ".join([t.strip() for t in pn.itertext() - if t.strip()]).strip() + full_name = " ".join([t.strip() for t in pn.itertext() if t.strip()]).strip() obj: Dict[str, Any] = dict(name=full_name) if given_name: obj["given_name"] = given_name @@ -51,7 +48,7 @@ def _parse_authors(elem: Optional[ET.Element], addr_e = ae.find(f"./{{{ns}}}address") if addr_e: address = dict() - for t in addr_e.getchildren(): + for t in list(addr_e): address[t.tag.split("}")[-1]] = t.text if address: address['post_code'] = address.pop('postCode', None) @@ -70,8 +67,7 @@ def _parse_authors(elem: Optional[ET.Element], def _parse_citation(elem: ET.Element, ns: str = ns) -> GrobidCitation: ref: Dict[str, Any] = dict() ref["id"] = elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id") - ref["unstructured"] = elem.findtext('.//{%s}note[@type="raw_reference"]' % - ns) + ref["unstructured"] = elem.findtext('.//{%s}note[@type="raw_reference"]' % ns) # Title stuff is messy in references... ref["title"] = elem.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") other_title = elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") @@ -82,11 +78,9 @@ def _parse_citation(elem: ET.Element, ns: str = ns) -> GrobidCitation: ref["journal"] = None ref["title"] = other_title ref["authors"] = _parse_authors(elem, ns=ns) - ref["publisher"] = elem.findtext( - f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") + ref["publisher"] = elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") if not ref["publisher"]: - ref["publisher"] = elem.findtext( - f".//{{{ns}}}imprint/{{{ns}}}publisher") + ref["publisher"] = elem.findtext(f".//{{{ns}}}imprint/{{{ns}}}publisher") if ref["publisher"] == "": ref["publisher"] = None date = elem.find('.//{%s}date[@type="published"]' % ns) @@ -124,8 +118,7 @@ def _parse_citation(elem: ET.Element, ns: str = ns) -> GrobidCitation: def _parse_journal(elem: ET.Element, ns: str = ns) -> GrobidJournal: journal = dict() journal["name"] = elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") - journal["publisher"] = elem.findtext( - f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") + journal["publisher"] = elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") if journal["publisher"] == "": journal["publisher"] = None journal["issn"] = elem.findtext('.//{%s}idno[@type="ISSN"]' % ns) @@ -138,16 +131,13 @@ def _parse_journal(elem: ET.Element, ns: str = ns) -> GrobidJournal: def _parse_header(elem: ET.Element, ns: str = ns) -> GrobidHeader: header = elem - info = dict() + info: Dict[str, Any] = dict() info["title"] = header.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") - info["authors"] = _parse_authors( - header.find(f".//{{{ns}}}sourceDesc/{{{ns}}}biblStruct")) + info["authors"] = _parse_authors(header.find(f".//{{{ns}}}sourceDesc/{{{ns}}}biblStruct")) info["journal"] = _parse_journal(header) date = header.find(f'.//{{{ns}}}date[@type="published"]') info["date"] = (date is not None) and date.attrib.get("when") info["doi"] = header.findtext(f'.//{{{ns}}}idno[@type="DOI"]') - if info["doi"]: - info["doi"] = info["doi"].lower() return GrobidHeader(**info) @@ -155,19 +145,17 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument: """ Use this function to parse TEI-XML of a full document or header processed by GROBID. - + Eg, the output of '/api/processFulltextDocument' or '/api/processHeader' """ tree = _string_to_tree(xml_text) tei = tree.getroot() - info = dict() header = tei.find(f".//{{{ns}}}teiHeader") if header is None: raise ValueError("XML does not look like TEI format") - application_tag = header.findall( - f".//{{{ns}}}appInfo/{{{ns}}}application")[0] + application_tag = header.findall(f".//{{{ns}}}appInfo/{{{ns}}}application")[0] doc = GrobidDocument( grobid_version=application_tag.attrib["version"].strip(), @@ -177,14 +165,12 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument: ) refs = [] - for (i, bs) in enumerate( - tei.findall(f".//{{{ns}}}listBibl/{{{ns}}}biblStruct")): + for (i, bs) in enumerate(tei.findall(f".//{{{ns}}}listBibl/{{{ns}}}biblStruct")): ref = _parse_citation(bs) ref.index = i refs.append(ref) doc.citations = refs - text = tei.find(f".//{{{ns}}}text") # print(text.attrib) if text and text.attrib.get(f"{{{xml_ns}}}lang"): @@ -205,11 +191,14 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument: def parse_citations_xml(xml_text: AnyStr) -> List[GrobidCitation]: """ Use this function to parse TEI-XML of one or more references. - + Eg, the output of '/api/processReferences' or '/api/processCitation'. """ # XXX: this replacement shouldn't be needed? - xml_text = xml_text.replace('xmlns="http://www.tei-c.org/ns/1.0"', "") + if isinstance(xml_text, bytes): + xml_text = xml_text.replace(b'xmlns="http://www.tei-c.org/ns/1.0"', b"") + elif isinstance(xml_text, str): + xml_text = xml_text.replace('xmlns="http://www.tei-c.org/ns/1.0"', "") tree = _string_to_tree(xml_text) root = tree.getroot() @@ -219,7 +208,7 @@ def parse_citations_xml(xml_text: AnyStr) -> List[GrobidCitation]: return [ref] refs = [] - for (i, bs) in enumerate(tree.findall(f".//biblStruct")): + for (i, bs) in enumerate(tree.findall(".//biblStruct")): ref = _parse_citation(bs, ns='') ref.index = i refs.append(ref) diff --git a/grobid_tei_xml/types.py b/grobid_tei_xml/types.py index e6718c1..b86e1a4 100644 --- a/grobid_tei_xml/types.py +++ b/grobid_tei_xml/types.py @@ -1,5 +1,5 @@ from dataclasses import asdict, dataclass -from typing import Any, AnyStr, Dict, List, Optional +from typing import List, Optional @dataclass @@ -25,7 +25,7 @@ class GrobidAuthor: # TODO: 'forename'? given_name: Optional[str] = None surname: Optional[str] = None - affiliation: Optional[dict] = None + affiliation: Optional[GrobidAffiliation] = None @dataclass @@ -68,11 +68,11 @@ class GrobidJournal: @dataclass class GrobidHeader: + authors: List[GrobidAuthor] title: Optional[str] = None - authors: Optional[str] = None date: Optional[str] = None doi: Optional[str] = None - #TODO: note: Optional[str] + # TODO: note: Optional[str] journal: Optional[GrobidJournal] = None @@ -80,7 +80,7 @@ class GrobidHeader: class GrobidDocument: grobid_version: str grobid_timestamp: str - #TODO: pdf_md5: Optional[str] + # TODO: pdf_md5: Optional[str] header: GrobidHeader citations: Optional[List[GrobidCitation]] = None language_code: Optional[str] = None @@ -115,9 +115,11 @@ def _simplify_dict(d: dict) -> dict: """ Recursively remove empty dict values from a dict and all sub-lists and sub-dicts. + + TODO: should this return Optional[dict]? """ if d in [None, {}, '']: - return None + return {} for k in list(d.keys()): if isinstance(d[k], dict): d[k] = _simplify_dict(d[k]) diff --git a/pyproject.toml b/pyproject.toml index dc65b2d..7d35cd3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,8 @@ requires = ["setuptools", "wheel"] build-backend = "setuptools.build_meta:__legacy__" -#[tool.pytest.ini_options] -#python_files = tests/*.py grobid_tei_xml/*.py -#log_level = INFO +[tool.yapf] +COLUMN_LIMIT = 96 +[tool.isort] +line_length = 96 diff --git a/tests/test_grobid2json.py b/tests/test_grobid2json.py index a1c975e..47ab293 100644 --- a/tests/test_grobid2json.py +++ b/tests/test_grobid2json.py @@ -1,12 +1,12 @@ -import xml import json +import xml + import pytest -from grobid_tei_xml import teixml2json, parse_document_xml, GrobidDocument, GrobidCitation -from grobid_tei_xml.grobid2json import transform_grobid_ref_xml +from grobid_tei_xml.grobid2json import teixml2json, transform_grobid_ref_xml -def test_small_xml(): +def test_small_xml() -> None: with open('tests/files/small.xml', 'r') as f: tei_xml = f.read() @@ -16,7 +16,7 @@ def test_small_xml(): assert teixml2json(tei_xml) == json_form -def test_invalid_xml(): +def test_invalid_xml() -> None: with pytest.raises(xml.etree.ElementTree.ParseError): teixml2json("this is not XML") @@ -31,29 +31,21 @@ def test_grobid_teixml2json() -> None: obj = teixml2json(blob, True) - assert ( - obj["title"] == - "Changes of patients' satisfaction with the health care services in Lithuanian Health Promoting Hospitals network" - ) + assert obj[ + "title"] == """Changes of patients' satisfaction with the health care services in Lithuanian Health Promoting Hospitals network""" ref = [c for c in obj["citations"] if c["id"] == "b12"][0] - assert ref["authors"][0] == { - "given_name": "K", - "name": "K Tasa", - "surname": "Tasa" - } + assert ref["authors"][0] == {"given_name": "K", "name": "K Tasa", "surname": "Tasa"} assert ref["journal"] == "Quality Management in Health Care" assert ref["title"] == "Using patient feedback for quality improvement" assert ref["date"] == "1996" assert ref["pages"] == "206-225" assert ref["volume"] == "8" - assert ( - ref["unstructured"] == - "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19." - ) + assert ref["unstructured"] == \ + """Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19.""" -def test_transform_grobid_ref_xml(): +def test_transform_grobid_ref_xml() -> None: citation_xml = """ <biblStruct > <analytic> @@ -104,8 +96,9 @@ def test_transform_grobid_ref_xml(): </biblStruct>""" d = transform_grobid_ref_xml(citation_xml) - assert d[ - 'title'] == "Mesh migration following abdominal hernia repair: a comprehensive review" + assert d + assert d['title'] == \ + "Mesh migration following abdominal hernia repair: a comprehensive review" assert d['authors'][2]['given_name'] == "L" assert d['authors'][2]['surname'] == "Taveras" assert d['authors'][2]['name'] == "L R Taveras" diff --git a/tests/test_parse.py b/tests/test_parse.py index e79d41d..30b2926 100644 --- a/tests/test_parse.py +++ b/tests/test_parse.py @@ -1,17 +1,22 @@ -import xml +import io import json +import xml +import xml.etree.ElementTree + import pytest -from grobid_tei_xml import teixml2json, parse_document_xml, parse_citations_xml, GrobidDocument, GrobidCitation +from grobid_tei_xml import (GrobidCitation, GrobidDocument, parse_citations_xml, + parse_document_xml) from grobid_tei_xml.types import * -def test_small_xml(): +def test_small_xml() -> None: with open('tests/files/small.xml', 'r') as f: tei_xml = f.read() doc = parse_document_xml(tei_xml) + expected_body = """Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.""" expected = GrobidDocument( grobid_version='0.5.1-SNAPSHOT', grobid_timestamp='2018-04-02T00:31+0000', @@ -19,20 +24,19 @@ def test_small_xml(): header=GrobidHeader( title="Dummy Example File", authors=[ - GrobidAuthor( - name="Brewster Kahle", - given_name="Brewster", - surname="Kahle", - affiliation=GrobidAffiliation( - department="Faculty ofAgricultrial Engineering", - laboratory="Plant Physiology Laboratory", - institution="Technion-Israel Institute of Technology", - address=GrobidAddress( - post_code="32000", - settlement="Haifa", - country="Israel", - ), - )), + GrobidAuthor(name="Brewster Kahle", + given_name="Brewster", + surname="Kahle", + affiliation=GrobidAffiliation( + department="Faculty ofAgricultrial Engineering", + laboratory="Plant Physiology Laboratory", + institution="Technion-Israel Institute of Technology", + address=GrobidAddress( + post_code="32000", + settlement="Haifa", + country="Israel", + ), + )), GrobidAuthor( name="J Doe", given_name="J", @@ -40,23 +44,16 @@ def test_small_xml(): ), ], journal=GrobidJournal( - name= - "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678", - ), + name="Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678", ), date="2000", ), abstract="Everything you ever wanted to know about nothing", - body= - "Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.", + body=expected_body, citations=[ GrobidCitation( index=0, id="b0", - authors=[ - GrobidAuthor(name="A Seaperson", - given_name="A", - surname="Seaperson") - ], + authors=[GrobidAuthor(name="A Seaperson", given_name="A", surname="Seaperson")], date="2001", journal="Letters in the Alphabet", title="Everything is Wonderful", @@ -78,7 +75,7 @@ def test_small_xml(): assert doc == expected -def test_small_xml_json(): +def test_small_xml_json() -> None: with open('tests/files/small.xml', 'r') as f: tei_xml = f.read() @@ -95,13 +92,13 @@ def test_small_xml_json(): # remove nulls from old JSON for c in json_form['citations']: for k in list(c.keys()): - if c[k] == None: + if c[k] is None: c.pop(k) assert d == json_form -def test_invalid_xml(): +def test_invalid_xml() -> None: with pytest.raises(xml.etree.ElementTree.ParseError): parse_document_xml("this is not XML") @@ -109,6 +106,25 @@ def test_invalid_xml(): parse_citations_xml("this is not XML") with pytest.raises(ValueError): parse_document_xml("<xml></xml>") + with pytest.raises(TypeError): + parse_document_xml(123) # type: ignore + + +def test_bytes() -> None: + + with open('tests/files/small.xml', 'rb') as f: + tei_xml = f.read() + + parse_document_xml(tei_xml) + parse_document_xml(io.BytesIO(tei_xml)) # type: ignore + + +def test_elementtree() -> None: + + with open('tests/files/small.xml', 'rb') as f: + tei_xml = f.read() + + parse_document_xml(xml.etree.ElementTree.parse(io.BytesIO(tei_xml))) # type: ignore def test_example_grobid_tei_xml() -> None: @@ -118,12 +134,10 @@ def test_example_grobid_tei_xml() -> None: doc = parse_document_xml(blob) - assert ( - doc.header.title == - "Changes of patients' satisfaction with the health care services in Lithuanian Health Promoting Hospitals network" - ) + assert doc.header.title == \ + """Changes of patients' satisfaction with the health care services in Lithuanian Health Promoting Hospitals network""" - ref = [c for c in doc.citations if c.id == "b12"][0] + ref = [c for c in doc.citations or [] if c.id == "b12"][0] assert ref.authors[0].name == "K Tasa" assert ref.authors[0].given_name == "K" assert ref.authors[0].surname == "Tasa" @@ -132,13 +146,11 @@ def test_example_grobid_tei_xml() -> None: assert ref.date == "1996" assert ref.pages == "206-225" assert ref.volume == "8" - assert ( - ref.unstructured == - "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19." - ) + assert ref.unstructured == \ + """Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19.""" -def test_single_citations_xml(): +def test_single_citations_xml() -> None: citation_xml = """ <biblStruct > <analytic> @@ -189,7 +201,7 @@ def test_single_citations_xml(): </biblStruct>""" d = parse_citations_xml(citation_xml)[0] - assert d.title == "Mesh migration following abdominal hernia repair: a comprehensive review" + assert d.title == """Mesh migration following abdominal hernia repair: a comprehensive review""" assert d.authors[2].given_name == "L" assert d.authors[2].surname == "Taveras" assert d.authors[2].name == "L R Taveras" @@ -202,12 +214,11 @@ def test_single_citations_xml(): assert d.journal == "Hernia" -def test_citation_list_xml(): +def test_citation_list_xml() -> None: with open('tests/files/example_citation_list.xml', 'r') as f: tei_xml = f.read() citations = parse_citations_xml(tei_xml) assert len(citations) == 10 - assert citations[ - 7].title == "Global Hunger Index: The Challenge of Hidden Hunger" + assert citations[7].title == "Global Hunger Index: The Challenge of Hidden Hunger" |