about summary refs log tree commit diff stats
path: root/grobid_tei_xml
diff options
context:
space:
mode:
author    Bryan Newbold <bnewbold@archive.org>  2021-10-25 15:46:33 -0700
committer Bryan Newbold <bnewbold@archive.org>  2021-10-25 15:46:38 -0700
commit    baa6356b80b1a826eca77f74cc487d947d2fafd4 (patch)
tree      85032c05af7561f2147358e12b8bf2f11136c2d6 /grobid_tei_xml
parent    09668907c81492774986e11f0acd9b06090dfbe0 (diff)
download  grobid_tei_xml-baa6356b80b1a826eca77f74cc487d947d2fafd4.tar.gz
          grobid_tei_xml-baa6356b80b1a826eca77f74cc487d947d2fafd4.zip
schema expansion; grobid v0.7.x examples and test coverage
Diffstat (limited to 'grobid_tei_xml')
-rwxr-xr-x  grobid_tei_xml/parse.py  189
-rw-r--r--  grobid_tei_xml/types.py   95
2 files changed, 173 insertions, 111 deletions
diff --git a/grobid_tei_xml/parse.py b/grobid_tei_xml/parse.py
index 1d7eec7..da7ed97 100755
--- a/grobid_tei_xml/parse.py
+++ b/grobid_tei_xml/parse.py
@@ -25,63 +25,60 @@ def _string_to_tree(content: AnyStr) -> ET.ElementTree:
raise TypeError(f"expected XML as string or bytes, got: {type(content)}")
-def _parse_authors(elem: Optional[ET.Element], ns: str = ns) -> List[GrobidAuthor]:
+def _parse_author(elem: Optional[ET.Element], ns: str = ns) -> Optional[GrobidAuthor]:
"""
Internal helper to parse one or more TEI 'author' XML tags into
GrobidAuthor objects. 'author' could appear in document headers or
citations.
"""
- if not elem:
- return []
- authors = []
- for author_tag in elem.findall(f".//{{{ns}}}author"):
- persname_tag = author_tag.find(f"./{{{ns}}}persName")
- if persname_tag is None:
- # should we do something else here? it is possible to have author
- # without persName?
- continue
-
- # basic author name stuff
- given_name = persname_tag.findtext(f"./{{{ns}}}forename") or None
- surname = persname_tag.findtext(f"./{{{ns}}}surname") or None
- # instead create full_name from all the sub-components of the tag
- full_name = " ".join([t.strip() for t in persname_tag.itertext() if t.strip()]).strip()
- ga = GrobidAuthor(
- full_name=full_name or None,
- given_name=given_name,
- surname=surname,
- )
-
- # author affiliation
- affiliation_tag = author_tag.find(f"./{{{ns}}}affiliation")
- if affiliation_tag is not None:
- affiliation_dict: Dict[str, Any] = dict()
- for orgname_tag in affiliation_tag.findall(f"./{{{ns}}}orgName"):
- orgname_type = orgname_tag.get("type")
- if orgname_type:
- affiliation_dict[orgname_type] = orgname_tag.text or None
- if affiliation_dict:
- ga.affiliation = GrobidAffiliation(
- institution=affiliation_dict.get('institution'),
- department=affiliation_dict.get('department'),
- laboratory=affiliation_dict.get('laboratory'),
- )
- address_tag = affiliation_tag.find(f"./{{{ns}}}address")
- if address_tag is not None:
- address_dict = dict()
- for t in list(address_tag):
- address_dict[t.tag.split("}")[-1]] = t.text or None
- if address_dict:
- ga.affiliation.address = GrobidAddress(
- addr_line=address_dict.get('addrLine'),
- post_code=address_dict.get('postCode'),
- settlement=address_dict.get('settlement'),
- country=address_dict.get('country'),
- )
- authors.append(ga)
-
- return authors
+ if elem is None:
+ return None
+ persname_tag = elem.find(f"./{{{ns}}}persName")
+ if persname_tag is None:
+ # should we do something else here? it is possible to have author
+ # without persName?
+ return None
+
+ # basic author name stuff
+ # instead create full_name from all the sub-components of the tag
+ full_name = " ".join([t.strip() for t in persname_tag.itertext() if t.strip()]).strip()
+ ga = GrobidAuthor(
+ full_name=full_name or None,
+ given_name=persname_tag.findtext(f'./{{{ns}}}forename[@type="first"]'),
+ middle_name=persname_tag.findtext(f'./{{{ns}}}forename[@type="middle"]'),
+ surname=persname_tag.findtext(f"./{{{ns}}}surname"),
+ email=persname_tag.findtext(f"./{{{ns}}}email"),
+ orcid=elem.findtext(f'.//{{{ns}}}idno[@type="ORCID"]'),
+ )
+
+ # author affiliation
+ affiliation_tag = elem.find(f"./{{{ns}}}affiliation")
+ if affiliation_tag is not None:
+ affiliation_dict: Dict[str, Any] = dict()
+ for orgname_tag in affiliation_tag.findall(f"./{{{ns}}}orgName"):
+ orgname_type = orgname_tag.get("type")
+ if orgname_type:
+ affiliation_dict[orgname_type] = orgname_tag.text or None
+ if affiliation_dict:
+ ga.affiliation = GrobidAffiliation(
+ institution=affiliation_dict.get('institution'),
+ department=affiliation_dict.get('department'),
+ laboratory=affiliation_dict.get('laboratory'),
+ )
+ address_tag = affiliation_tag.find(f"./{{{ns}}}address")
+ if address_tag is not None:
+ address_dict = dict()
+ for t in list(address_tag):
+ address_dict[t.tag.split("}")[-1]] = t.text or None
+ if address_dict:
+ ga.affiliation.address = GrobidAddress(
+ addr_line=address_dict.get('addrLine'),
+ post_code=address_dict.get('postCode'),
+ settlement=address_dict.get('settlement'),
+ country=address_dict.get('country'),
+ )
+ return ga
def _clean_url(url: Optional[str]) -> Optional[str]:
@@ -134,45 +131,75 @@ def _parse_biblio(elem: ET.Element, ns: str = ns) -> GrobidBiblio:
Could be document header or a citation.
"""
+ authors = []
+ for ela in elem.findall(f".//{{{ns}}}author"):
+ a = _parse_author(ela, ns=ns)
+ if a is not None:
+ authors.append(a)
+
+ editors = []
+ editor_tags = elem.findall(f'.//{{{ns}}}editor')
+ if not editor_tags:
+ editor_tags = elem.findall(f'.//{{{ns}}}contributor[@role="editor"]')
+ for elt in editor_tags or []:
+ e = _parse_author(elt, ns=ns)
+ if e is not None:
+ editors.append(e)
+
biblio = GrobidBiblio(
- authors=_parse_authors(elem, ns=ns),
- id=elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id") or None,
- unstructured=elem.findtext(f'.//{{{ns}}}note[@type="raw_reference"]') or None,
+ authors=authors,
+ editors=editors or None,
+ id=elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id"),
+ unstructured=elem.findtext(f'.//{{{ns}}}note[@type="raw_reference"]'),
# date below
- title=elem.findtext(f".//{{{ns}}}analytic/{{{ns}}}title") or None,
- journal=elem.findtext(f".//{{{ns}}}monogr/{{{ns}}}title") or None,
- journal_abbrev=None, # XXX
- publisher=elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher") or None,
- volume=elem.findtext(f'.//{{{ns}}}biblScope[@unit="volume"]') or None,
- issue=elem.findtext(f'.//{{{ns}}}biblScope[@unit="issue"]') or None,
+        # titles: @level=a for article, @level=m for manuscript (book)
+ title=elem.findtext(f'.//{{{ns}}}title[@type="main"]'),
+ journal=elem.findtext(f'.//{{{ns}}}title[@level="j"]'),
+ journal_abbrev=elem.findtext(f'.//{{{ns}}}title[@level="j"][@type="abbrev"]'),
+ series_title=elem.findtext(f'.//{{{ns}}}title[@level="s"]'),
+ publisher=elem.findtext(f".//{{{ns}}}publicationStmt/{{{ns}}}publisher"),
+ institution=elem.findtext(f".//{{{ns}}}respStmt/{{{ns}}}orgName"),
+ volume=elem.findtext(f'.//{{{ns}}}biblScope[@unit="volume"]'),
+ issue=elem.findtext(f'.//{{{ns}}}biblScope[@unit="issue"]'),
# pages below
- # XXX: note
- doi=elem.findtext(f'.//{{{ns}}}idno[@type="DOI"]') or None,
- pmid=elem.findtext(f'.//{{{ns}}}idno[@type="PMID"]') or None,
- pmcid=elem.findtext(f'.//{{{ns}}}idno[@type="PMCID"]') or None,
- arxiv_id=elem.findtext(f'.//{{{ns}}}idno[@type="arXiv"]') or None,
- issn=elem.findtext(f'.//{{{ns}}}idno[@type="ISSN"]') or None,
- eissn=elem.findtext(f'.//{{{ns}}}idno[@type="eISSN"]') or None,
+ doi=elem.findtext(f'.//{{{ns}}}idno[@type="DOI"]'),
+ pmid=elem.findtext(f'.//{{{ns}}}idno[@type="PMID"]'),
+ pmcid=elem.findtext(f'.//{{{ns}}}idno[@type="PMCID"]'),
+ arxiv_id=elem.findtext(f'.//{{{ns}}}idno[@type="arXiv"]'),
+ pii=elem.findtext(f'.//{{{ns}}}idno[@type="PII"]'),
+ ark=elem.findtext(f'.//{{{ns}}}idno[@type="ark"]'),
+ istex_id=elem.findtext(f'.//{{{ns}}}idno[@type="istexId"]'),
+ issn=elem.findtext(f'.//{{{ns}}}idno[@type="ISSN"]'),
+ eissn=elem.findtext(f'.//{{{ns}}}idno[@type="eISSN"]'),
)
+ book_title_tag = elem.find(f'.//{{{ns}}}title[@level="m"]')
+ if book_title_tag is not None and book_title_tag.attrib.get('type') is None:
+ biblio.book_title = book_title_tag.text
+ if biblio.book_title and not biblio.title:
+ biblio.title = biblio.book_title
+
+ note_tag = elem.find(f'.//{{{ns}}}note')
+ if note_tag is not None and note_tag.attrib.get('type') is None:
+ biblio.note = note_tag.text
+
if not biblio.publisher:
- biblio.publisher = elem.findtext(f".//{{{ns}}}imprint/{{{ns}}}publisher") or None
+ biblio.publisher = elem.findtext(f".//{{{ns}}}imprint/{{{ns}}}publisher")
date_tag = elem.find(f'.//{{{ns}}}date[@type="published"]')
if date_tag is not None:
biblio.date = date_tag.attrib.get("when") or None
- # title stuff is messy in references...
- if biblio.journal and not biblio.title:
- biblio.title = biblio.journal
- biblio.journal = None
-
if biblio.arxiv_id and biblio.arxiv_id.startswith("arXiv:"):
biblio.arxiv_id = biblio.arxiv_id[6:]
el = elem.find(f'.//{{{ns}}}biblScope[@unit="page"]')
if el is not None:
+ if el.attrib.get("from"):
+ biblio.first_page = el.attrib["from"]
+ if el.attrib.get("to"):
+ biblio.last_page = el.attrib["to"]
if el.attrib.get("from") and el.attrib.get("to"):
biblio.pages = "{}-{}".format(el.attrib["from"], el.attrib["to"])
else:
@@ -205,7 +232,7 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
grobid_version=application_tag.attrib["version"].strip(),
grobid_timestamp=application_tag.attrib["when"].strip(),
header=_parse_biblio(header),
- pdf_md5=header.findtext(f'.//{{{ns}}}idno[@type="MD5"]') or None,
+ pdf_md5=header.findtext(f'.//{{{ns}}}idno[@type="MD5"]'),
)
refs = []
@@ -217,17 +244,23 @@ def parse_document_xml(xml_text: AnyStr) -> GrobidDocument:
text = tei.find(f".//{{{ns}}}text")
# print(text.attrib)
+
if text and text.attrib.get(f"{{{xml_ns}}}lang"):
+ # this is the 'body' language
doc.language_code = text.attrib[f"{{{xml_ns}}}lang"] # xml:lang
el = tei.find(f".//{{{ns}}}profileDesc/{{{ns}}}abstract")
- doc.abstract = (el or None) and " ".join(el.itertext()).strip()
+ if el is not None:
+ doc.abstract = " ".join(el.itertext()).strip() or None
el = tei.find(f".//{{{ns}}}text/{{{ns}}}body")
- doc.body = (el or None) and " ".join(el.itertext()).strip()
+ if el is not None:
+ doc.body = " ".join(el.itertext()).strip() or None
el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="acknowledgement"]')
- doc.acknowledgement = (el or None) and " ".join(el.itertext()).strip()
+ if el is not None:
+ doc.acknowledgement = " ".join(el.itertext()).strip() or None
el = tei.find(f'.//{{{ns}}}back/{{{ns}}}div[@type="annex"]')
- doc.annex = (el or None) and " ".join(el.itertext()).strip()
+ if el is not None:
+ doc.annex = " ".join(el.itertext()).strip() or None
return doc
diff --git a/grobid_tei_xml/types.py b/grobid_tei_xml/types.py
index 8356c8e..252e677 100644
--- a/grobid_tei_xml/types.py
+++ b/grobid_tei_xml/types.py
@@ -8,7 +8,6 @@ class GrobidAddress:
post_code: Optional[str] = None
settlement: Optional[str] = None
country: Optional[str] = None
- country_code: Optional[str] = None # XXX
@dataclass
@@ -23,17 +22,16 @@ class GrobidAffiliation:
class GrobidAuthor:
full_name: Optional[str]
given_name: Optional[str] = None
- middle: Optional[str] = None # XXX
+ middle_name: Optional[str] = None
surname: Optional[str] = None
- suffix: Optional[str] = None # XXX
email: Optional[str] = None # XXX
+ orcid: Optional[str] = None # XXX
affiliation: Optional[GrobidAffiliation] = None
def to_csl_dict(self) -> dict:
d = dict(
- given=self.given_name,
+ given=self.given_name or self.middle_name,
family=self.surname,
- suffix=self.suffix,
)
return _simplify_dict(d)
@@ -79,28 +77,64 @@ class GrobidBiblio:
date: Optional[str] = None
title: Optional[str] = None
- journal: Optional[str] = None # XXX: venue? other?
+ book_title: Optional[str] = None
+ series_title: Optional[str] = None
+ editors: Optional[List[GrobidAuthor]] = None
+ journal: Optional[str] = None
journal_abbrev: Optional[str] = None
publisher: Optional[str] = None
+ institution: Optional[str] = None
issn: Optional[str] = None
eissn: Optional[str] = None
volume: Optional[str] = None
issue: Optional[str] = None
pages: Optional[str] = None
- first_page: Optional[str] = None # XXX
- last_page: Optional[str] = None # XXX
+ first_page: Optional[str] = None
+ last_page: Optional[str] = None
note: Optional[str] = None
doi: Optional[str] = None
pmid: Optional[str] = None
pmcid: Optional[str] = None
arxiv_id: Optional[str] = None
+ pii: Optional[str] = None
+ ark: Optional[str] = None
+ istex_id: Optional[str] = None
url: Optional[str] = None
- oa_url: Optional[str] = None
def to_dict(self) -> dict:
return _simplify_dict(asdict(self))
+ def to_legacy_dict(self) -> dict:
+ """
+ Returns a dict in the old "grobid2json" format.
+ """
+ d = self.to_dict()
+
+ # new keys
+ d.pop('first_page', None)
+ d.pop('last_page', None)
+ d.pop('note', None)
+
+ # legacy book title behavior
+ if not d.get('journal') and d.get('book_title'):
+ d['journal'] = d.pop('book_title')
+ else:
+ d.pop('book_title', None)
+
+ # author changes
+ for a in d['authors']:
+ a['name'] = a.pop('full_name', None)
+ if not a.get('given_name'):
+ a['given_name'] = a.pop('middle_name', None)
+ else:
+ a.pop('middle_name', None)
+ addr = a.get('affiliation', {}).get('address')
+ if addr and addr.get('post_code'):
+ addr['postCode'] = addr.pop('post_code')
+
+ return _simplify_dict(d)
+
def to_csl_dict(self, default_type: str = "article-journal") -> dict:
"""
Transforms in to Citation Style Language (CSL) JSON schema, as a dict
@@ -119,11 +153,14 @@ class GrobidBiblio:
PMCID=self.pmcid,
ISSN=self.issn,
note=self.note,
- # fields with '-' in the key name
- **{
- "container-title": self.journal,
- "page-first": self.first_page,
- })
+ )
+ # fields with '-' in the key name
+ csl.update({
+ "container-title": self.journal,
+ "book-title": self.book_title,
+ "series-title": self.series_title,
+ "page-first": self.first_page,
+ })
# numeric fields
if self.issue and self.issue.isdigit():
@@ -164,32 +201,24 @@ class GrobidDocument:
Returns a dict in the old "grobid2json" format.
"""
d = self.to_dict()
+ d.pop('header', None)
+ d.update(self.header.to_legacy_dict())
+ if self.citations:
+ d['citations'] = [c.to_legacy_dict() for c in self.citations]
# all header fields at top-level
- header = d.pop('header', {})
d['journal'] = dict(
- name=header.pop('journal', None),
- abbrev=header.pop('journal_abbrev', None),
- publisher=header.pop('publisher', None),
- issn=header.pop('issn', None),
- issne=header.pop('issne', None),
+ name=d.pop('journal', None),
+ publisher=d.pop('publisher', None),
+ issn=d.pop('issn', None),
+ issne=d.pop('issne', None),
+ volume=d.pop('volume', None),
+ issue=d.pop('issue', None),
)
- d.update(header)
- # files not in the old schema
+ # document fields not in the old schema
d.pop('pdf_md5', None)
- for c in d.get('citations', []):
- c.pop('note', None)
- # author changes
- for a in d['authors']:
- a['name'] = a.pop('full_name')
- addr = a.get('affiliation', {}).get('address')
- if addr and addr.get('post_code'):
- addr['postCode'] = addr.pop('post_code')
- for c in d['citations'] or []:
- for a in c['authors']:
- a['name'] = a.pop('full_name')
return _simplify_dict(d)
def remove_encumbered(self) -> None: