diff options
Diffstat (limited to 'grobid_tei_xml/parse.py')
-rwxr-xr-x | grobid_tei_xml/parse.py | 37 |
1 files changed, 18 insertions, 19 deletions
diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py index 66e4e72..cd55f9a 100755 --- a/grobid_tei_xml/parse.py +++ b/grobid_tei_xml/parse.py @@ -62,9 +62,9 @@ def _parse_author(elem: Optional[ET.Element], ns: str = ns) -> Optional[GrobidAu affiliation_dict[orgname_type] = orgname_tag.text or None if affiliation_dict: ga.affiliation = GrobidAffiliation( - institution=affiliation_dict.get('institution'), - department=affiliation_dict.get('department'), - laboratory=affiliation_dict.get('laboratory'), + institution=affiliation_dict.get("institution"), + department=affiliation_dict.get("department"), + laboratory=affiliation_dict.get("laboratory"), ) address_tag = affiliation_tag.find(f"./{{{ns}}}address") if address_tag is not None: @@ -73,10 +73,10 @@ def _parse_author(elem: Optional[ET.Element], ns: str = ns) -> Optional[GrobidAu address_dict[t.tag.split("}")[-1]] = t.text or None if address_dict: ga.affiliation.address = GrobidAddress( - addr_line=address_dict.get('addrLine'), - post_code=address_dict.get('postCode'), - settlement=address_dict.get('settlement'), - country=address_dict.get('country'), + addr_line=address_dict.get("addrLine"), + post_code=address_dict.get("postCode"), + settlement=address_dict.get("settlement"), + country=address_dict.get("country"), ) return ga @@ -121,7 +121,7 @@ def test_clean_url() -> None: ] for row in examples: - assert row['clean'] == _clean_url(row['dirty']) + assert row["clean"] == _clean_url(row["dirty"]) def _parse_biblio(elem: ET.Element, ns: str = ns) -> GrobidBiblio: @@ -138,7 +138,7 @@ def _parse_biblio(elem: ET.Element, ns: str = ns) -> GrobidBiblio: authors.append(a) editors = [] - editor_tags = elem.findall(f'.//{{{ns}}}editor') + editor_tags = elem.findall(f".//{{{ns}}}editor") if not editor_tags: editor_tags = elem.findall(f'.//{{{ns}}}contributor[@role="editor"]') for elt in editor_tags or []: @@ -151,7 +151,6 @@ def _parse_biblio(elem: ET.Element, ns: str = ns) -> GrobidBiblio: editors=editors or None, id=elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id"), unstructured=elem.findtext(f'.//{{{ns}}}note[@type="raw_reference"]'), - # date below # titles: @level=a for article, @level=m for manuscrupt (book) title=elem.findtext(f'.//{{{ns}}}title[@type="main"]'), @@ -175,14 +174,14 @@ def _parse_biblio(elem: ET.Element, ns: str = ns) -> GrobidBiblio: ) book_title_tag = elem.find(f'.//{{{ns}}}title[@level="m"]') - if book_title_tag is not None and book_title_tag.attrib.get('type') is None: + if book_title_tag is not None and book_title_tag.attrib.get("type") is None: biblio.book_title = book_title_tag.text if biblio.book_title and not biblio.title: biblio.title = biblio.book_title biblio.book_title = None - note_tag = elem.find(f'.//{{{ns}}}note') - if note_tag is not None and note_tag.attrib.get('type') is None: + note_tag = elem.find(f".//{{{ns}}}note") + if note_tag is not None and note_tag.attrib.get("type") is None: biblio.note = note_tag.text if not biblio.publisher: @@ -212,7 +211,7 @@ def _parse_biblio(elem: ET.Element, ns: str = ns) -> GrobidBiblio: # having DOI and a DOI URL is redundant if biblio.doi and biblio.url: - if ('://doi.org/' in biblio.url) or ('://dx.doi.org/' in biblio.url): + if ("://doi.org/" in biblio.url) or ("://dx.doi.org/" in biblio.url): biblio.url = None return biblio @@ -283,20 +282,20 @@ def parse_citations_xml(xml_text: AnyStr) -> List[GrobidBiblio]: the namespace. """ if isinstance(xml_text, bytes): - xml_text = xml_text.replace(b'xmlns="http://www.tei-c.org/ns/1.0"', b'') + xml_text = xml_text.replace(b'xmlns="http://www.tei-c.org/ns/1.0"', b"") elif isinstance(xml_text, str): - xml_text = xml_text.replace('xmlns="http://www.tei-c.org/ns/1.0"', '') + xml_text = xml_text.replace('xmlns="http://www.tei-c.org/ns/1.0"', "") tree = _string_to_tree(xml_text) root = tree.getroot() - if root.tag == 'biblStruct': - ref = _parse_biblio(root, ns='') + if root.tag == "biblStruct": + ref = _parse_biblio(root, ns="") ref.index = 0 return [ref] refs = [] for (i, bs) in enumerate(tree.findall(".//biblStruct")): - ref = _parse_biblio(bs, ns='') + ref = _parse_biblio(bs, ns="") ref.index = i refs.append(ref) return refs |