| author | Bryan Newbold <bnewbold@archive.org> | 2021-10-27 15:20:11 -0700 |
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-27 15:20:11 -0700 |
| commit | 23458a69c2c7db89a66681843e4b9c7f643362c7 (patch) | |
| tree | 34e84867092de8364d9978004fbcc051030a2117 | |
| parent | 6a34c0b8d5976bb35aecf3c794166e711b338f18 (diff) | |
make fmt (black 21.9b0)
| mode | path | lines changed |
|---|---|---|
| -rwxr-xr-x | grobid_tei_xml/parse.py | 37 |
| -rw-r--r-- | grobid_tei_xml/types.py | 74 |
| -rw-r--r-- | tests/test_csl.py | 12 |
| -rw-r--r-- | tests/test_parse.py | 82 |

4 files changed, 108 insertions, 97 deletions
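This commit is a pure formatting pass: black 21.9b0 normalizes single-quoted strings to double quotes and re-wraps long calls and assertions, with no behavior change. The repository's `make fmt` target itself is not part of this diff, so the snippet below is only a sketch of the same rewrite driven through black's Python API rather than the project's actual tooling.

```python
# Hedged sketch: reproduce the kind of rewrite black 21.9b0 applies in this
# commit (quote normalization) via black's Python API. The project's real
# `make fmt` target is not shown in this diff; this is an illustration only.
import black

src = "institution = affiliation_dict.get('institution')\n"

# black.Mode() uses black's defaults: 88-character lines, double quotes.
formatted = black.format_str(src, mode=black.Mode())

print(formatted)  # institution = affiliation_dict.get("institution")
```

Pinning the formatter version, as the commit message does with 21.9b0, keeps a `make fmt` run reproducible across contributors, since black's output can change between releases.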
diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py
index 66e4e72..cd55f9a 100755
--- a/grobid_tei_xml/parse.py
+++ b/grobid_tei_xml/parse.py
@@ -62,9 +62,9 @@ def _parse_author(elem: Optional[ET.Element], ns: str = ns) -> Optional[GrobidAu
             affiliation_dict[orgname_type] = orgname_tag.text or None
         if affiliation_dict:
             ga.affiliation = GrobidAffiliation(
-                institution=affiliation_dict.get('institution'),
-                department=affiliation_dict.get('department'),
-                laboratory=affiliation_dict.get('laboratory'),
+                institution=affiliation_dict.get("institution"),
+                department=affiliation_dict.get("department"),
+                laboratory=affiliation_dict.get("laboratory"),
             )
             address_tag = affiliation_tag.find(f"./{{{ns}}}address")
             if address_tag is not None:
@@ -73,10 +73,10 @@ def _parse_author(elem: Optional[ET.Element], ns: str = ns) -> Optional[GrobidAu
                     address_dict[t.tag.split("}")[-1]] = t.text or None
                 if address_dict:
                     ga.affiliation.address = GrobidAddress(
-                        addr_line=address_dict.get('addrLine'),
-                        post_code=address_dict.get('postCode'),
-                        settlement=address_dict.get('settlement'),
-                        country=address_dict.get('country'),
+                        addr_line=address_dict.get("addrLine"),
+                        post_code=address_dict.get("postCode"),
+                        settlement=address_dict.get("settlement"),
+                        country=address_dict.get("country"),
                     )

     return ga
@@ -121,7 +121,7 @@ def test_clean_url() -> None:
     ]

     for row in examples:
-        assert row['clean'] == _clean_url(row['dirty'])
+        assert row["clean"] == _clean_url(row["dirty"])


 def _parse_biblio(elem: ET.Element, ns: str = ns) -> GrobidBiblio:
@@ -138,7 +138,7 @@ def _parse_biblio(elem: ET.Element, ns: str = ns) -> GrobidBiblio:
             authors.append(a)

     editors = []
-    editor_tags = elem.findall(f'.//{{{ns}}}editor')
+    editor_tags = elem.findall(f".//{{{ns}}}editor")
     if not editor_tags:
         editor_tags = elem.findall(f'.//{{{ns}}}contributor[@role="editor"]')
     for elt in editor_tags or []:
@@ -151,7 +151,6 @@ def _parse_biblio(elem: ET.Element, ns: str = ns) -> GrobidBiblio:
         editors=editors or None,
         id=elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id"),
         unstructured=elem.findtext(f'.//{{{ns}}}note[@type="raw_reference"]'),
-
         # date below
         # titles: @level=a for article, @level=m for manuscrupt (book)
         title=elem.findtext(f'.//{{{ns}}}title[@type="main"]'),
@@ -175,14 +174,14 @@ def _parse_biblio(elem: ET.Element, ns: str = ns) -> GrobidBiblio:
     )

     book_title_tag = elem.find(f'.//{{{ns}}}title[@level="m"]')
-    if book_title_tag is not None and book_title_tag.attrib.get('type') is None:
+    if book_title_tag is not None and book_title_tag.attrib.get("type") is None:
         biblio.book_title = book_title_tag.text
     if biblio.book_title and not biblio.title:
         biblio.title = biblio.book_title
         biblio.book_title = None

-    note_tag = elem.find(f'.//{{{ns}}}note')
-    if note_tag is not None and note_tag.attrib.get('type') is None:
+    note_tag = elem.find(f".//{{{ns}}}note")
+    if note_tag is not None and note_tag.attrib.get("type") is None:
         biblio.note = note_tag.text

     if not biblio.publisher:
@@ -212,7 +211,7 @@ def _parse_biblio(elem: ET.Element, ns: str = ns) -> GrobidBiblio:

     # having DOI and a DOI URL is redundant
     if biblio.doi and biblio.url:
-        if ('://doi.org/' in biblio.url) or ('://dx.doi.org/' in biblio.url):
+        if ("://doi.org/" in biblio.url) or ("://dx.doi.org/" in biblio.url):
             biblio.url = None

     return biblio
@@ -283,20 +282,20 @@ def parse_citations_xml(xml_text: AnyStr) -> List[GrobidBiblio]:
     the namespace.
     """
     if isinstance(xml_text, bytes):
-        xml_text = xml_text.replace(b'xmlns="http://www.tei-c.org/ns/1.0"', b'')
+        xml_text = xml_text.replace(b'xmlns="http://www.tei-c.org/ns/1.0"', b"")
     elif isinstance(xml_text, str):
-        xml_text = xml_text.replace('xmlns="http://www.tei-c.org/ns/1.0"', '')
+        xml_text = xml_text.replace('xmlns="http://www.tei-c.org/ns/1.0"', "")

     tree = _string_to_tree(xml_text)
     root = tree.getroot()
-    if root.tag == 'biblStruct':
-        ref = _parse_biblio(root, ns='')
+    if root.tag == "biblStruct":
+        ref = _parse_biblio(root, ns="")
         ref.index = 0
         return [ref]

     refs = []
     for (i, bs) in enumerate(tree.findall(".//biblStruct")):
-        ref = _parse_biblio(bs, ns='')
+        ref = _parse_biblio(bs, ns="")
         ref.index = i
         refs.append(ref)
     return refs
diff --git a/grobid_tei_xml/types.py b/grobid_tei_xml/types.py
index 08be47a..725871b 100644
--- a/grobid_tei_xml/types.py
+++ b/grobid_tei_xml/types.py
@@ -47,13 +47,13 @@ def _csl_date(s: Optional[str]) -> Optional[list]:
         return None

     # YYYY-MM
-    if len(s) >= 7 and s[4] == '-' and s[5:7].isdigit():
+    if len(s) >= 7 and s[4] == "-" and s[5:7].isdigit():
         month = int(s[5:7])
     else:
         return [[year]]

     # YYYY-MM-DD
-    if len(s) == 10 and s[7] == '-' and s[8:10].isdigit():
+    if len(s) == 10 and s[7] == "-" and s[8:10].isdigit():
         day = int(s[8:10])
         return [[year, month, day]]
     else:
@@ -112,26 +112,26 @@ class GrobidBiblio:
         d = self.to_dict()

         # new keys
-        d.pop('first_page', None)
-        d.pop('last_page', None)
-        d.pop('note', None)
+        d.pop("first_page", None)
+        d.pop("last_page", None)
+        d.pop("note", None)

         # legacy book title behavior
-        if not d.get('journal') and d.get('book_title'):
-            d['journal'] = d.pop('book_title')
+        if not d.get("journal") and d.get("book_title"):
+            d["journal"] = d.pop("book_title")
         else:
-            d.pop('book_title', None)
+            d.pop("book_title", None)

         # author changes
-        for a in d['authors']:
-            a['name'] = a.pop('full_name', None)
-            if not a.get('given_name'):
-                a['given_name'] = a.pop('middle_name', None)
+        for a in d["authors"]:
+            a["name"] = a.pop("full_name", None)
+            if not a.get("given_name"):
+                a["given_name"] = a.pop("middle_name", None)
             else:
-                a.pop('middle_name', None)
-            addr = a.get('affiliation', {}).get('address')
-            if addr and addr.get('post_code'):
-                addr['postCode'] = addr.pop('post_code')
+                a.pop("middle_name", None)
+            addr = a.get("affiliation", {}).get("address")
+            if addr and addr.get("post_code"):
+                addr["postCode"] = addr.pop("post_code")

         return _simplify_dict(d)

@@ -155,18 +155,20 @@ class GrobidBiblio:
             note=self.note,
         )
         # fields with '-' in the key name
-        csl.update({
-            "container-title": self.journal,
-            "book-title": self.book_title,
-            "series-title": self.series_title,
-            "page-first": self.first_page,
-        })
+        csl.update(
+            {
+                "container-title": self.journal,
+                "book-title": self.book_title,
+                "series-title": self.series_title,
+                "page-first": self.first_page,
+            }
+        )

         # numeric fields
         if self.issue and self.issue.isdigit():
-            csl['issue'] = int(self.issue)
+            csl["issue"] = int(self.issue)
         if self.volume and self.volume.isdigit():
-            csl['volume'] = int(self.volume)
+            csl["volume"] = int(self.volume)

         return _simplify_dict(csl)

@@ -201,23 +203,23 @@ class GrobidDocument:
         Returns a dict in the old "grobid2json" format.
         """
         d = self.to_dict()
-        d.pop('header', None)
+        d.pop("header", None)
         d.update(self.header.to_legacy_dict())
         if self.citations:
-            d['citations'] = [c.to_legacy_dict() for c in self.citations]
+            d["citations"] = [c.to_legacy_dict() for c in self.citations]

         # all header fields at top-level
-        d['journal'] = dict(
-            name=d.pop('journal', None),
-            publisher=d.pop('publisher', None),
-            issn=d.pop('issn', None),
-            issne=d.pop('issne', None),
-            volume=d.pop('volume', None),
-            issue=d.pop('issue', None),
+        d["journal"] = dict(
+            name=d.pop("journal", None),
+            publisher=d.pop("publisher", None),
+            issn=d.pop("issn", None),
+            issne=d.pop("issne", None),
+            volume=d.pop("volume", None),
+            issue=d.pop("issue", None),
         )

         # document fields not in the old schema
-        d.pop('pdf_md5', None)
+        d.pop("pdf_md5", None)

         return _simplify_dict(d)

@@ -246,7 +248,7 @@ def _simplify_dict(d: dict) -> dict:

     TODO: should this return Optional[dict]?
     """
-    if d in [None, {}, '']:
+    if d in [None, {}, ""]:
         return {}
     for k in list(d.keys()):
         if isinstance(d[k], dict):
@@ -255,6 +257,6 @@ def _simplify_dict(d: dict) -> dict:
             for i in range(len(d[k])):
                 if isinstance(d[k][i], dict):
                     d[k][i] = _simplify_dict(d[k][i])
-        if d[k] in [None, {}, '']:
+        if d[k] in [None, {}, ""]:
             d.pop(k)
     return d
diff --git a/tests/test_csl.py b/tests/test_csl.py
index 27c8c3e..e8ded91 100644
--- a/tests/test_csl.py
+++ b/tests/test_csl.py
@@ -3,7 +3,7 @@ from grobid_tei_xml import parse_document_xml


 def test_small_xml_csl() -> None:
-    with open('tests/files/small.xml', 'r') as f:
+    with open("tests/files/small.xml", "r") as f:
         tei_xml = f.read()

     d = parse_document_xml(tei_xml)
@@ -11,10 +11,7 @@ def test_small_xml_csl() -> None:
         "type": "article-journal",
         "title": "Dummy Example File",
         "author": [
-            {
-                "given": "Brewster",
-                "family": "Kahle"
-            },
+            {"given": "Brewster", "family": "Kahle"},
             {
                 "given": "J",
                 "family": "Doe",
@@ -29,10 +26,7 @@ def test_small_xml_csl() -> None:
         "type": "article-journal",
         "title": "Everything is Wonderful",
         "author": [
-            {
-                "given": "A",
-                "family": "Seaperson"
-            },
+            {"given": "A", "family": "Seaperson"},
         ],
         "container-title": "Letters in the Alphabet",
         "issued": [[2001]],
diff --git a/tests/test_parse.py b/tests/test_parse.py
index 976d1b1..25529c4 100644
--- a/tests/test_parse.py
+++ b/tests/test_parse.py
@@ -11,31 +11,33 @@ from grobid_tei_xml.types import *


 def test_small_xml() -> None:
-    with open('tests/files/small.xml', 'r') as f:
+    with open("tests/files/small.xml", "r") as f:
         tei_xml = f.read()

     doc = parse_document_xml(tei_xml)
     expected_body = """Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED."""
     expected = GrobidDocument(
-        grobid_version='0.5.1-SNAPSHOT',
-        grobid_timestamp='2018-04-02T00:31+0000',
-        language_code='en',
+        grobid_version="0.5.1-SNAPSHOT",
+        grobid_timestamp="2018-04-02T00:31+0000",
+        language_code="en",
         header=GrobidBiblio(
             title="Dummy Example File",
             authors=[
-                GrobidAuthor(full_name="Brewster Kahle",
-                             given_name="Brewster",
-                             surname="Kahle",
-                             affiliation=GrobidAffiliation(
-                                 department="Faculty ofAgricultrial Engineering",
-                                 laboratory="Plant Physiology Laboratory",
-                                 institution="Technion-Israel Institute of Technology",
-                                 address=GrobidAddress(
-                                     post_code="32000",
-                                     settlement="Haifa",
-                                     country="Israel",
-                                 ),
-                             )),
+                GrobidAuthor(
+                    full_name="Brewster Kahle",
+                    given_name="Brewster",
+                    surname="Kahle",
+                    affiliation=GrobidAffiliation(
+                        department="Faculty ofAgricultrial Engineering",
+                        laboratory="Plant Physiology Laboratory",
+                        institution="Technion-Israel Institute of Technology",
+                        address=GrobidAddress(
+                            post_code="32000",
+                            settlement="Haifa",
+                            country="Israel",
+                        ),
+                    ),
+                ),
                 GrobidAuthor(
                     full_name="J Doe",
                     given_name="J",
@@ -80,9 +82,9 @@ def test_small_xml() -> None:


 def test_small_xml_legacy() -> None:
-    with open('tests/files/small.xml', 'r') as f:
+    with open("tests/files/small.xml", "r") as f:
         tei_xml = f.read()
-    with open('tests/files/small.json', 'r') as f:
+    with open("tests/files/small.json", "r") as f:
         json_form = json.loads(f.read())

     d = parse_document_xml(tei_xml).to_legacy_dict()
@@ -104,7 +106,7 @@ def test_invalid_xml() -> None:


 def test_bytes() -> None:
-    with open('tests/files/small.xml', 'rb') as f:
+    with open("tests/files/small.xml", "rb") as f:
         tei_xml = f.read()

     parse_document_xml(tei_xml)
@@ -113,7 +115,7 @@ def test_bytes() -> None:


 def test_elementtree() -> None:
-    with open('tests/files/small.xml', 'rb') as f:
+    with open("tests/files/small.xml", "rb") as f:
         tei_xml = f.read()

     parse_document_xml(xml.etree.ElementTree.parse(io.BytesIO(tei_xml)))  # type: ignore
@@ -126,8 +128,10 @@ def test_example_grobid_tei_xml() -> None:

     doc = parse_document_xml(blob)

-    assert doc.header.title == \
-        """Changes of patients' satisfaction with the health care services in Lithuanian Health Promoting Hospitals network"""
+    assert (
+        doc.header.title
+        == """Changes of patients' satisfaction with the health care services in Lithuanian Health Promoting Hospitals network"""
+    )

     ref = [c for c in doc.citations or [] if c.id == "b12"][0]
     assert ref.authors[0].full_name == "K Tasa"
@@ -138,8 +142,10 @@ def test_example_grobid_tei_xml() -> None:
     assert ref.date == "1996"
     assert ref.pages == "206-225"
     assert ref.volume == "8"
-    assert ref.unstructured == \
-        """Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19."""
+    assert (
+        ref.unstructured
+        == """Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19."""
+    )


 def test_single_citations_xml() -> None:
@@ -193,7 +199,10 @@ def test_single_citations_xml() -> None:
     </biblStruct>"""

     d = parse_citations_xml(citation_xml)[0]
-    assert d.title == """Mesh migration following abdominal hernia repair: a comprehensive review"""
+    assert (
+        d.title
+        == """Mesh migration following abdominal hernia repair: a comprehensive review"""
+    )
     assert d.authors[2].given_name == "L"
     assert d.authors[2].middle_name == "R"
     assert d.authors[2].surname == "Taveras"
@@ -211,7 +220,7 @@ def test_single_citations_xml() -> None:


 def test_citation_list_xml() -> None:
-    with open('tests/files/example_citation_list.xml', 'r') as f:
+    with open("tests/files/example_citation_list.xml", "r") as f:
         tei_xml = f.read()

     citations = parse_citations_xml(tei_xml)
@@ -237,17 +246,21 @@ def test_citation_list_xml() -> None:
     assert citations[11].series_title == "Handbook of Optics"
     assert citations[11].publisher == "McGRAW-HILL"

-    assert citations[
-        12].title == "Implications of abandoned shoreline features above Glacial Lake Duluth levels along the north shore of the Superior Basin in the vicinity of the Brule River"
-    assert citations[
-        12].book_title == "Paper presented at the 13th Biennial Meeting of the American Quaternary Association"
+    assert (
+        citations[12].title
+        == "Implications of abandoned shoreline features above Glacial Lake Duluth levels along the north shore of the Superior Basin in the vicinity of the Brule River"
+    )
+    assert (
+        citations[12].book_title
+        == "Paper presented at the 13th Biennial Meeting of the American Quaternary Association"
+    )

     assert citations[12].institution == "University of Minnesota"


 def test_grobid_070_document() -> None:
     # more recent GROBID v0.7.0 output
-    with open('tests/files/example_grobid_plos.tei.xml', 'r') as f:
+    with open("tests/files/example_grobid_plos.tei.xml", "r") as f:
         tei_xml = f.read()

     doc = parse_document_xml(tei_xml)
@@ -267,7 +280,10 @@ def test_grobid_070_document() -> None:

     cite_b3 = doc.citations[3]
     assert cite_b3.url == "http://unesdoc.unesco.org/ulis/"
-    assert cite_b3.title == "Requirements for Global Implementation of the Strategic Plan for Coastal GOOS"
+    assert (
+        cite_b3.title
+        == "Requirements for Global Implementation of the Strategic Plan for Coastal GOOS"
+    )
     assert cite_b3.authors
     assert cite_b3.authors[0].surname == "Ioc-Unesco"
     assert cite_b3.date == "2012"