about summary refs log tree commit diff stats
path: root/grobid_tei_xml/parse.py
diff options
context:
space:
mode:
Diffstat (limited to 'grobid_tei_xml/parse.py')
-rwxr-xr-x grobid_tei_xml/parse.py 37
1 files changed, 18 insertions, 19 deletions
diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py
index 66e4e72..cd55f9a 100755
--- a/grobid_tei_xml/parse.py
+++ b/grobid_tei_xml/parse.py
@@ -62,9 +62,9 @@ def _parse_author(elem: Optional[ET.Element], ns: str = ns) -> Optional[GrobidAu
affiliation_dict[orgname_type] = orgname_tag.text or None
if affiliation_dict:
ga.affiliation = GrobidAffiliation(
- institution=affiliation_dict.get('institution'),
- department=affiliation_dict.get('department'),
- laboratory=affiliation_dict.get('laboratory'),
+ institution=affiliation_dict.get("institution"),
+ department=affiliation_dict.get("department"),
+ laboratory=affiliation_dict.get("laboratory"),
)
address_tag = affiliation_tag.find(f"./{{{ns}}}address")
if address_tag is not None:
@@ -73,10 +73,10 @@ def _parse_author(elem: Optional[ET.Element], ns: str = ns) -> Optional[GrobidAu
address_dict[t.tag.split("}")[-1]] = t.text or None
if address_dict:
ga.affiliation.address = GrobidAddress(
- addr_line=address_dict.get('addrLine'),
- post_code=address_dict.get('postCode'),
- settlement=address_dict.get('settlement'),
- country=address_dict.get('country'),
+ addr_line=address_dict.get("addrLine"),
+ post_code=address_dict.get("postCode"),
+ settlement=address_dict.get("settlement"),
+ country=address_dict.get("country"),
)
return ga
@@ -121,7 +121,7 @@ def test_clean_url() -> None:
]
for row in examples:
- assert row['clean'] == _clean_url(row['dirty'])
+ assert row["clean"] == _clean_url(row["dirty"])
def _parse_biblio(elem: ET.Element, ns: str = ns) -> GrobidBiblio:
@@ -138,7 +138,7 @@ def _parse_biblio(elem: ET.Element, ns: str = ns) -> GrobidBiblio:
authors.append(a)
editors = []
- editor_tags = elem.findall(f'.//{{{ns}}}editor')
+ editor_tags = elem.findall(f".//{{{ns}}}editor")
if not editor_tags:
editor_tags = elem.findall(f'.//{{{ns}}}contributor[@role="editor"]')
for elt in editor_tags or []:
@@ -151,7 +151,6 @@ def _parse_biblio(elem: ET.Element, ns: str = ns) -> GrobidBiblio:
editors=editors or None,
id=elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id"),
unstructured=elem.findtext(f'.//{{{ns}}}note[@type="raw_reference"]'),
-
# date below
# titles: @level=a for article, @level=m for monograph (book)
title=elem.findtext(f'.//{{{ns}}}title[@type="main"]'),
@@ -175,14 +174,14 @@ def _parse_biblio(elem: ET.Element, ns: str = ns) -> GrobidBiblio:
)
book_title_tag = elem.find(f'.//{{{ns}}}title[@level="m"]')
- if book_title_tag is not None and book_title_tag.attrib.get('type') is None:
+ if book_title_tag is not None and book_title_tag.attrib.get("type") is None:
biblio.book_title = book_title_tag.text
if biblio.book_title and not biblio.title:
biblio.title = biblio.book_title
biblio.book_title = None
- note_tag = elem.find(f'.//{{{ns}}}note')
- if note_tag is not None and note_tag.attrib.get('type') is None:
+ note_tag = elem.find(f".//{{{ns}}}note")
+ if note_tag is not None and note_tag.attrib.get("type") is None:
biblio.note = note_tag.text
if not biblio.publisher:
@@ -212,7 +211,7 @@ def _parse_biblio(elem: ET.Element, ns: str = ns) -> GrobidBiblio:
# having DOI and a DOI URL is redundant
if biblio.doi and biblio.url:
- if ('://doi.org/' in biblio.url) or ('://dx.doi.org/' in biblio.url):
+ if ("://doi.org/" in biblio.url) or ("://dx.doi.org/" in biblio.url):
biblio.url = None
return biblio
@@ -283,20 +282,20 @@ def parse_citations_xml(xml_text: AnyStr) -> List[GrobidBiblio]:
the namespace.
"""
if isinstance(xml_text, bytes):
- xml_text = xml_text.replace(b'xmlns="http://www.tei-c.org/ns/1.0"', b'')
+ xml_text = xml_text.replace(b'xmlns="http://www.tei-c.org/ns/1.0"', b"")
elif isinstance(xml_text, str):
- xml_text = xml_text.replace('xmlns="http://www.tei-c.org/ns/1.0"', '')
+ xml_text = xml_text.replace('xmlns="http://www.tei-c.org/ns/1.0"', "")
tree = _string_to_tree(xml_text)
root = tree.getroot()
- if root.tag == 'biblStruct':
- ref = _parse_biblio(root, ns='')
+ if root.tag == "biblStruct":
+ ref = _parse_biblio(root, ns="")
ref.index = 0
return [ref]
refs = []
for (i, bs) in enumerate(tree.findall(".//biblStruct")):
- ref = _parse_biblio(bs, ns='')
+ ref = _parse_biblio(bs, ns="")
ref.index = i
refs.append(ref)
return refs