diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-25 15:46:33 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-25 15:46:38 -0700 |
commit | baa6356b80b1a826eca77f74cc487d947d2fafd4 (patch) | |
tree | 85032c05af7561f2147358e12b8bf2f11136c2d6 /grobid_tei_xml/parse.py | |
parent | 09668907c81492774986e11f0acd9b06090dfbe0 (diff) | |
download | grobid_tei_xml-baa6356b80b1a826eca77f74cc487d947d2fafd4.tar.gz grobid_tei_xml-baa6356b80b1a826eca77f74cc487d947d2fafd4.zip |
schema expansion; grobid v0.7.x examples and test coverage
Diffstat (limited to 'grobid_tei_xml/parse.py')
-rwxr-xr-x | grobid_tei_xml/parse.py | 189 |
1 files changed, 111 insertions, 78 deletions
diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py index 1d7eec7..da7ed97 100755 --- a/grobid_tei_xml/parse.py +++ b/grobid_tei_xml/parse.py @@ -25,63 +25,60 @@ def _string_to_tree(content: AnyStr) -> ET.ElementTree: raise TypeError(f"expected XML as string or bytes, got: {type(content)}") -def _parse_authors(elem: Optional[ET.Element], ns: str = ns) -> List[GrobidAuthor]: +def _parse_author(elem: Optional[ET.Element], ns: str = ns) -> Optional[GrobidAuthor]: """ Internal helper to parse one or more TEI 'author' XML tags into GrobidAuthor objects. 'author' could appear in document headers or citations. """ - if not elem: - return [] - authors = [] - for author_tag in elem.findall(f".//{{{ns}}}author"): - persname_tag = author_tag.find(f"./{{{ns}}}persName") - if persname_tag is None: - # should we do something else here? it is possible to have author - # without persName? - continue - - # basic author name stuff - given_name = persname_tag.findtext(f"./{{{ns}}}forename") or None - surname = persname_tag.findtext(f"./{{{ns}}}surname") or None - # instead create full_name from all the sub-components of the tag - full_name = " ".join([t.strip() for t in persname_tag.itertext() if t.strip()]).strip() - ga = GrobidAuthor( - full_name=full_name or None, - given_name=given_name, - surname=surname, - ) - - # author affiliation - affiliation_tag = author_tag.find(f"./{{{ns}}}affiliation") - if affiliation_tag is not None: - affiliation_dict: Dict[str, Any] = dict() - for orgname_tag in affiliation_tag.findall(f"./{{{ns}}}orgName"): - orgname_type = orgname_tag.get("type") - if orgname_type: - affiliation_dict[orgname_type] = orgname_tag.text or None - if affiliation_dict: - ga.affiliation = GrobidAffiliation( - institution=affiliation_dict.get('institution'), - department=affiliation_dict.get('department'), - laboratory=affiliation_dict.get('laboratory'), - ) - address_tag = affiliation_tag.find(f"./{{{ns}}}address") - if address_tag is not None: - address_dict 
= dict() - for t in list(address_tag): - address_dict[t.tag.split("}")[-1]] = t.text or None - if address_dict: - ga.affiliation.address = GrobidAddress( - addr_line=address_dict.get('addrLine'), - post_code=address_dict.get('postCode'), - settlement=address_dict.get('settlement'), - country=address_dict.get('country'), - ) - authors.append(ga) - - return authors + if elem is None: + return None + persname_tag = elem.find(f"./{{{ns}}}persName") + if persname_tag is None: + # should we do something else here? it is possible to have author + # without persName? + return None + + # basic author name stuff + # instead create full_name from all the sub-components of the tag + full_name = " ".join([t.strip() for t in persname_tag.itertext() if t.strip()]).strip() + ga = GrobidAuthor( + full_name=full_name or None, + given_name=persname_tag.findtext(f'./{{{ns}}}forename[@type="first"]'), + middle_name=persname_tag.findtext(f'./{{{ns}}}forename[@type="middle"]'), + surname=persname_tag.findtext(f"./{{{ns}}}surname"), + email=persname_tag.findtext(f"./{{{ns}}}email"), + orcid=elem.findtext(f'.//{{{ns}}}idno[@type="ORCID"]'), + ) + + # author affiliation + affiliation_tag = elem.find(f"./{{{ns}}}affiliation") + if affiliation_tag is not None: + affiliation_dict: Dict[str, Any] = dict() + for orgname_tag in affiliation_tag.findall(f"./{{{ns}}}orgName"): + orgname_type = orgname_tag.get("type") + if orgname_type: + affiliation_dict[orgname_type] = orgname_tag.text or None + if affiliation_dict: + ga.affiliation = GrobidAffiliation( + institution=affiliation_dict.get('institution'), + department=affiliation_dict.get('department'), + laboratory=affiliation_dict.get('laboratory'), + ) + address_tag = affiliation_tag.find(f"./{{{ns}}}address") + if address_tag is not None: + address_dict = dict() + for t in list(address_tag): + address_dict[t.tag.split("}")[-1]] = t.text or None + if address_dict: + ga.affiliation.address = GrobidAddress( + addr_line=address_dict.get('addrLine'), 
+ post_code=address_dict.get('postCode'), + settlement=address_dict.get('settlement'), + country=address_dict.get('country'), + ) + return ga def _clean_url(url: Optional[str]) -> Optional[str]: @@ -134,45 +131,75 @@ def _parse_biblio(elem: ET.Element, ns: str = ns) -> GrobidBiblio: Could be document header or a citation. """ + authors = [] + for ela in elem.findall(f".//{{{ns}}}author"): + a = _parse_author(ela, ns=ns) + if a is not None: + authors.append(a) + + editors = [] + editor_tags = elem.findall(f'.//{{{ns}}}editor') + if not editor_tags: + editor_tags = elem.findall(f'.//{{{ns}}}contributor[@role="editor"]') + for elt in editor_tags or []: + e = _parse_author(elt, ns=ns) + if e is not None: + editors.append(e) + biblio = GrobidBiblio( - authors=_parse_authors(elem, ns=ns), - id=elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id") or None, - unstructured=elem.findtext(f'.//{{{ns}}}note[@type="raw_reference"]') or None, + authors=authors, + editors=editors or None, + id=elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id"), + unstructured=elem.findtext(f'.//{{{ns}}}note[@type="raw_reference"]'), # date below - title=elem.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") or None, - journal=elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") or None, - journal_abbrev=None, # XXX - publisher=elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") or None, - volume=elem.findtext(f'.//{{{ns}}}biblScope[@unit="volume"]') or None, - issue=elem.findtext(f'.//{{{ns}}}biblScope[@unit="issue"]') or None, + # titles: @level=a for article, @level=m for monograph (book) + title=elem.findtext(f'.//{{{ns}}}title[@type="main"]'), + journal=elem.findtext(f'.//{{{ns}}}title[@level="j"]'), + journal_abbrev=elem.findtext(f'.//{{{ns}}}title[@level="j"][@type="abbrev"]'), + series_title=elem.findtext(f'.//{{{ns}}}title[@level="s"]'), + publisher=elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher"), +
institution=elem.findtext(f".//{{{ns}}}respStmt/{{{ns}}}orgName"), + volume=elem.findtext(f'.//{{{ns}}}biblScope[@unit="volume"]'), + issue=elem.findtext(f'.//{{{ns}}}biblScope[@unit="issue"]'), # pages below - # XXX: note - doi=elem.findtext(f'.//{{{ns}}}idno[@type="DOI"]') or None, - pmid=elem.findtext(f'.//{{{ns}}}idno[@type="PMID"]') or None, - pmcid=elem.findtext(f'.//{{{ns}}}idno[@type="PMCID"]') or None, - arxiv_id=elem.findtext(f'.//{{{ns}}}idno[@type="arXiv"]') or None, - issn=elem.findtext(f'.//{{{ns}}}idno[@type="ISSN"]') or None, - eissn=elem.findtext(f'.//{{{ns}}}idno[@type="eISSN"]') or None, + doi=elem.findtext(f'.//{{{ns}}}idno[@type="DOI"]'), + pmid=elem.findtext(f'.//{{{ns}}}idno[@type="PMID"]'), + pmcid=elem.findtext(f'.//{{{ns}}}idno[@type="PMCID"]'), + arxiv_id=elem.findtext(f'.//{{{ns}}}idno[@type="arXiv"]'), + pii=elem.findtext(f'.//{{{ns}}}idno[@type="PII"]'), + ark=elem.findtext(f'.//{{{ns}}}idno[@type="ark"]'), + istex_id=elem.findtext(f'.//{{{ns}}}idno[@type="istexId"]'), + issn=elem.findtext(f'.//{{{ns}}}idno[@type="ISSN"]'), + eissn=elem.findtext(f'.//{{{ns}}}idno[@type="eISSN"]'), ) + book_title_tag = elem.find(f'.//{{{ns}}}title[@level="m"]') + if book_title_tag is not None and book_title_tag.attrib.get('type') is None: + biblio.book_title = book_title_tag.text + if biblio.book_title and not biblio.title: + biblio.title = biblio.book_title + + note_tag = elem.find(f'.//{{{ns}}}note') + if note_tag is not None and note_tag.attrib.get('type') is None: + biblio.note = note_tag.text + if not biblio.publisher: - biblio.publisher = elem.findtext(f".//{{{ns}}}imprint/{{{ns}}}publisher") or None + biblio.publisher = elem.findtext(f".//{{{ns}}}imprint/{{{ns}}}publisher") date_tag = elem.find(f'.//{{{ns}}}date[@type="published"]') if date_tag is not None: biblio.date = date_tag.attrib.get("when") or None - # title stuff is messy in references... 
- if biblio.journal and not biblio.title: - biblio.title = biblio.journal - biblio.journal = None - if biblio.arxiv_id and biblio.arxiv_id.startswith("arXiv:"): biblio.arxiv_id = biblio.arxiv_id[6:] el = elem.find(f'.//{{{ns}}}biblScope[@unit="page"]') if el is not None: + if el.attrib.get("from"): + biblio.first_page = el.attrib["from"] + if el.attrib.get("to"): + biblio.last_page = el.attrib["to"] if el.attrib.get("from") and el.attrib.get("to"): biblio.pages = "{}-{}".format(el.attrib["from"], el.attrib["to"]) else: @@ -205,7 +232,7 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument: grobid_version=application_tag.attrib["version"].strip(), grobid_timestamp=application_tag.attrib["when"].strip(), header=_parse_biblio(header), - pdf_md5=header.findtext(f'.//{{{ns}}}idno[@type="MD5"]') or None, + pdf_md5=header.findtext(f'.//{{{ns}}}idno[@type="MD5"]'), ) refs = [] @@ -217,17 +244,23 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument: text = tei.find(f".//{{{ns}}}text") # print(text.attrib) + if text and text.attrib.get(f"{{{xml_ns}}}lang"): + # this is the 'body' language doc.language_code = text.attrib[f"{{{xml_ns}}}lang"] # xml:lang el = tei.find(f".//{{{ns}}}profileDesc/{{{ns}}}abstract") - doc.abstract = (el or None) and " ".join(el.itertext()).strip() + if el is not None: + doc.abstract = " ".join(el.itertext()).strip() or None el = tei.find(f".//{{{ns}}}text/{{{ns}}}body") - doc.body = (el or None) and " ".join(el.itertext()).strip() + if el is not None: + doc.body = " ".join(el.itertext()).strip() or None el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="acknowledgement"]') - doc.acknowledgement = (el or None) and " ".join(el.itertext()).strip() + if el is not None: + doc.acknowledgement = " ".join(el.itertext()).strip() or None el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="annex"]') - doc.annex = (el or None) and " ".join(el.itertext()).strip() + if el is not None: + doc.annex = " ".join(el.itertext()).strip() or None return 
doc |