schema expansion; grobid v0.7.x examples and test coverage

author: Bryan Newbold <bnewbold@archive.org> 2021-10-25 15:46:33 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-10-25 15:46:38 -0700
commit: baa6356b80b1a826eca77f74cc487d947d2fafd4 (patch)
tree: 85032c05af7561f2147358e12b8bf2f11136c2d6 /grobid_tei_xml/parse.py
parent: 09668907c81492774986e11f0acd9b06090dfbe0 (diff)
download: grobid_tei_xml-baa6356b80b1a826eca77f74cc487d947d2fafd4.tar.gz
grobid_tei_xml-baa6356b80b1a826eca77f74cc487d947d2fafd4.zip
1 files changed, 111 insertions, 78 deletions
diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py
index 1d7eec7..da7ed97 100755
--- a/grobid_tei_xml/parse.py
+++ b/grobid_tei_xml/parse.py
@@ -25,63 +25,60 @@ def _string_to_tree(content: AnyStr) -> ET.ElementTree:
         raise TypeError(f"expected XML as string or bytes, got: {type(content)}")
 
 
-def _parse_authors(elem: Optional[ET.Element], ns: str = ns) -> List[GrobidAuthor]:
+def _parse_author(elem: Optional[ET.Element], ns: str = ns) -> Optional[GrobidAuthor]:
     """
     Internal helper to parse one or more TEI 'author' XML tags into
     GrobidAuthor objects. 'author' could appear in document headers or
     citations.
     """
-    if not elem:
-        return []
 
-    authors = []
-    for author_tag in elem.findall(f".//{{{ns}}}author"):
-        persname_tag = author_tag.find(f"./{{{ns}}}persName")
-        if persname_tag is None:
-            # should we do something else here? it is possible to have author
-            # without persName?
-            continue
-
-        # basic author name stuff
-        given_name = persname_tag.findtext(f"./{{{ns}}}forename") or None
-        surname = persname_tag.findtext(f"./{{{ns}}}surname") or None
-        # instead create full_name from all the sub-components of the tag
-        full_name = " ".join([t.strip() for t in persname_tag.itertext() if t.strip()]).strip()
-        ga = GrobidAuthor(
-            full_name=full_name or None,
-            given_name=given_name,
-            surname=surname,
-        )
-
-        # author affiliation
-        affiliation_tag = author_tag.find(f"./{{{ns}}}affiliation")
-        if affiliation_tag is not None:
-            affiliation_dict: Dict[str, Any] = dict()
-            for orgname_tag in affiliation_tag.findall(f"./{{{ns}}}orgName"):
-                orgname_type = orgname_tag.get("type")
-                if orgname_type:
-                    affiliation_dict[orgname_type] = orgname_tag.text or None
-            if affiliation_dict:
-                ga.affiliation = GrobidAffiliation(
-                    institution=affiliation_dict.get('institution'),
-                    department=affiliation_dict.get('department'),
-                    laboratory=affiliation_dict.get('laboratory'),
-                )
-                address_tag = affiliation_tag.find(f"./{{{ns}}}address")
-                if address_tag is not None:
-                    address_dict = dict()
-                    for t in list(address_tag):
-                        address_dict[t.tag.split("}")[-1]] = t.text or None
-                    if address_dict:
-                        ga.affiliation.address = GrobidAddress(
-                            addr_line=address_dict.get('addrLine'),
-                            post_code=address_dict.get('postCode'),
-                            settlement=address_dict.get('settlement'),
-                            country=address_dict.get('country'),
-                        )
-        authors.append(ga)
-
-    return authors
+    if elem is None:
+        return None
+    persname_tag = elem.find(f"./{{{ns}}}persName")
+    if persname_tag is None:
+        # should we do something else here? is it possible to have an author
+        # without persName?
+        return None
+
+    # basic author name stuff
+    # instead create full_name from all the sub-components of the tag
+    full_name = " ".join([t.strip() for t in persname_tag.itertext() if t.strip()]).strip()
+    ga = GrobidAuthor(
+        full_name=full_name or None,
+        given_name=persname_tag.findtext(f'./{{{ns}}}forename[@type="first"]'),
+        middle_name=persname_tag.findtext(f'./{{{ns}}}forename[@type="middle"]'),
+        surname=persname_tag.findtext(f"./{{{ns}}}surname"),
+        email=persname_tag.findtext(f"./{{{ns}}}email"),
+        orcid=elem.findtext(f'.//{{{ns}}}idno[@type="ORCID"]'),
+    )
+
+    # author affiliation
+    affiliation_tag = elem.find(f"./{{{ns}}}affiliation")
+    if affiliation_tag is not None:
+        affiliation_dict: Dict[str, Any] = dict()
+        for orgname_tag in affiliation_tag.findall(f"./{{{ns}}}orgName"):
+            orgname_type = orgname_tag.get("type")
+            if orgname_type:
+                affiliation_dict[orgname_type] = orgname_tag.text or None
+        if affiliation_dict:
+            ga.affiliation = GrobidAffiliation(
+                institution=affiliation_dict.get('institution'),
+                department=affiliation_dict.get('department'),
+                laboratory=affiliation_dict.get('laboratory'),
+            )
+            address_tag = affiliation_tag.find(f"./{{{ns}}}address")
+            if address_tag is not None:
+                address_dict = dict()
+                for t in list(address_tag):
+                    address_dict[t.tag.split("}")[-1]] = t.text or None
+                if address_dict:
+                    ga.affiliation.address = GrobidAddress(
+                        addr_line=address_dict.get('addrLine'),
+                        post_code=address_dict.get('postCode'),
+                        settlement=address_dict.get('settlement'),
+                        country=address_dict.get('country'),
+                    )
+    return ga
 
 
 def _clean_url(url: Optional[str]) -> Optional[str]:
@@ -134,45 +131,75 @@ def _parse_biblio(elem: ET.Element, ns: str = ns) -> GrobidBiblio:
     Could be document header or a citation.
     """
 
+    authors = []
+    for ela in elem.findall(f".//{{{ns}}}author"):
+        a = _parse_author(ela, ns=ns)
+        if a is not None:
+            authors.append(a)
+
+    editors = []
+    editor_tags = elem.findall(f'.//{{{ns}}}editor')
+    if not editor_tags:
+        editor_tags = elem.findall(f'.//{{{ns}}}contributor[@role="editor"]')
+    for elt in editor_tags or []:
+        e = _parse_author(elt, ns=ns)
+        if e is not None:
+            editors.append(e)
+
     biblio = GrobidBiblio(
-        authors=_parse_authors(elem, ns=ns),
-        id=elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id") or None,
-        unstructured=elem.findtext(f'.//{{{ns}}}note[@type="raw_reference"]') or None,
+        authors=authors,
+        editors=editors or None,
+        id=elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id"),
+        unstructured=elem.findtext(f'.//{{{ns}}}note[@type="raw_reference"]'),
 
         # date below
-        title=elem.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") or None,
-        journal=elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") or None,
-        journal_abbrev=None,  # XXX
-        publisher=elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") or None,
-        volume=elem.findtext(f'.//{{{ns}}}biblScope[@unit="volume"]') or None,
-        issue=elem.findtext(f'.//{{{ns}}}biblScope[@unit="issue"]') or None,
+        # titles: @level=a for article (analytic), @level=m for monograph (book)
+        title=elem.findtext(f'.//{{{ns}}}title[@type="main"]'),
+        journal=elem.findtext(f'.//{{{ns}}}title[@level="j"]'),
+        journal_abbrev=elem.findtext(f'.//{{{ns}}}title[@level="j"][@type="abbrev"]'),
+        series_title=elem.findtext(f'.//{{{ns}}}title[@level="s"]'),
+        publisher=elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher"),
+        institution=elem.findtext(f".//{{{ns}}}respStmt/{{{ns}}}orgName"),
+        volume=elem.findtext(f'.//{{{ns}}}biblScope[@unit="volume"]'),
+        issue=elem.findtext(f'.//{{{ns}}}biblScope[@unit="issue"]'),
         # pages below
-        # XXX: note
-        doi=elem.findtext(f'.//{{{ns}}}idno[@type="DOI"]') or None,
-        pmid=elem.findtext(f'.//{{{ns}}}idno[@type="PMID"]') or None,
-        pmcid=elem.findtext(f'.//{{{ns}}}idno[@type="PMCID"]') or None,
-        arxiv_id=elem.findtext(f'.//{{{ns}}}idno[@type="arXiv"]') or None,
-        issn=elem.findtext(f'.//{{{ns}}}idno[@type="ISSN"]') or None,
-        eissn=elem.findtext(f'.//{{{ns}}}idno[@type="eISSN"]') or None,
+        doi=elem.findtext(f'.//{{{ns}}}idno[@type="DOI"]'),
+        pmid=elem.findtext(f'.//{{{ns}}}idno[@type="PMID"]'),
+        pmcid=elem.findtext(f'.//{{{ns}}}idno[@type="PMCID"]'),
+        arxiv_id=elem.findtext(f'.//{{{ns}}}idno[@type="arXiv"]'),
+        pii=elem.findtext(f'.//{{{ns}}}idno[@type="PII"]'),
+        ark=elem.findtext(f'.//{{{ns}}}idno[@type="ark"]'),
+        istex_id=elem.findtext(f'.//{{{ns}}}idno[@type="istexId"]'),
+        issn=elem.findtext(f'.//{{{ns}}}idno[@type="ISSN"]'),
+        eissn=elem.findtext(f'.//{{{ns}}}idno[@type="eISSN"]'),
     )
 
+    book_title_tag = elem.find(f'.//{{{ns}}}title[@level="m"]')
+    if book_title_tag is not None and book_title_tag.attrib.get('type') is None:
+        biblio.book_title = book_title_tag.text
+    if biblio.book_title and not biblio.title:
+        biblio.title = biblio.book_title
+
+    note_tag = elem.find(f'.//{{{ns}}}note')
+    if note_tag is not None and note_tag.attrib.get('type') is None:
+        biblio.note = note_tag.text
+
     if not biblio.publisher:
-        biblio.publisher = elem.findtext(f".//{{{ns}}}imprint/{{{ns}}}publisher") or None
+        biblio.publisher = elem.findtext(f".//{{{ns}}}imprint/{{{ns}}}publisher")
 
     date_tag = elem.find(f'.//{{{ns}}}date[@type="published"]')
     if date_tag is not None:
         biblio.date = date_tag.attrib.get("when") or None
 
-    # title stuff is messy in references...
-    if biblio.journal and not biblio.title:
-        biblio.title = biblio.journal
-        biblio.journal = None
-
     if biblio.arxiv_id and biblio.arxiv_id.startswith("arXiv:"):
         biblio.arxiv_id = biblio.arxiv_id[6:]
 
     el = elem.find(f'.//{{{ns}}}biblScope[@unit="page"]')
     if el is not None:
+        if el.attrib.get("from"):
+            biblio.first_page = el.attrib["from"]
+        if el.attrib.get("to"):
+            biblio.last_page = el.attrib["to"]
         if el.attrib.get("from") and el.attrib.get("to"):
             biblio.pages = "{}-{}".format(el.attrib["from"], el.attrib["to"])
         else:
@@ -205,7 +232,7 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
         grobid_version=application_tag.attrib["version"].strip(),
         grobid_timestamp=application_tag.attrib["when"].strip(),
         header=_parse_biblio(header),
-        pdf_md5=header.findtext(f'.//{{{ns}}}idno[@type="MD5"]') or None,
+        pdf_md5=header.findtext(f'.//{{{ns}}}idno[@type="MD5"]'),
     )
 
     refs = []
@@ -217,17 +244,23 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
 
     text = tei.find(f".//{{{ns}}}text")
     # print(text.attrib)
+
     if text and text.attrib.get(f"{{{xml_ns}}}lang"):
+        # this is the 'body' language
         doc.language_code = text.attrib[f"{{{xml_ns}}}lang"]  # xml:lang
 
     el = tei.find(f".//{{{ns}}}profileDesc/{{{ns}}}abstract")
-    doc.abstract = (el or None) and " ".join(el.itertext()).strip()
+    if el is not None:
+        doc.abstract = " ".join(el.itertext()).strip() or None
     el = tei.find(f".//{{{ns}}}text/{{{ns}}}body")
-    doc.body = (el or None) and " ".join(el.itertext()).strip()
+    if el is not None:
+        doc.body = " ".join(el.itertext()).strip() or None
     el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="acknowledgement"]')
-    doc.acknowledgement = (el or None) and " ".join(el.itertext()).strip()
+    if el is not None:
+        doc.acknowledgement = " ".join(el.itertext()).strip() or None
     el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="annex"]')
-    doc.annex = (el or None) and " ".join(el.itertext()).strip()
+    if el is not None:
+        doc.annex = " ".join(el.itertext()).strip() or None
 
     return doc
author	Bryan Newbold <bnewbold@archive.org>	2021-10-25 15:46:33 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2021-10-25 15:46:38 -0700
commit	baa6356b80b1a826eca77f74cc487d947d2fafd4 (patch)
tree	85032c05af7561f2147358e12b8bf2f11136c2d6 /grobid_tei_xml/parse.py
parent	09668907c81492774986e11f0acd9b06090dfbe0 (diff)
download	grobid_tei_xml-baa6356b80b1a826eca77f74cc487d947d2fafd4.tar.gz grobid_tei_xml-baa6356b80b1a826eca77f74cc487d947d2fafd4.zip