refactor use of grobid_tei_xml

author: Bryan Newbold <bnewbold@archive.org> 2021-10-27 18:24:19 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-10-27 18:25:58 -0700
commit: 560d5f7cc1672f95e2a953ab5908f4205151a703 (patch)
tree: 04b35084358786bbd2329491be07cde35a4d2289
parent: 33211915773a0c77d064c55c1b02ceed6f455feb (diff)
download: fatcat-scholar-560d5f7cc1672f95e2a953ab5908f4205151a703.tar.gz fatcat-scholar-560d5f7cc1672f95e2a953ab5908f4205151a703.zip
5 files changed, 102 insertions, 75 deletions
diff --git a/fatcat_scholar/query_citation.py b/fatcat_scholar/query_citation.py
index dea4f02..0233acc 100644
--- a/fatcat_scholar/query_citation.py
+++ b/fatcat_scholar/query_citation.py
@@ -11,14 +11,14 @@ parallel with "regular" query?
 """
 
 import sys
-from typing import Any, Optional, Tuple
+from typing import Any, List, Optional, Tuple
 
 import fuzzycat.common
 import fuzzycat.verify
 import requests
 from fatcat_openapi_client import ReleaseContrib, ReleaseEntity, ReleaseExtIds
 from fuzzycat.matching import match_release_fuzzy
-from grobid_tei_xml import parse_citations_xml
+from grobid_tei_xml import GrobidBiblio, parse_citations_xml
 
 from fatcat_scholar.api_entities import entity_to_dict
 
@@ -44,44 +44,42 @@ def grobid_process_citation(
     return grobid_response.text
 
 
-def transform_grobid(raw_xml: str) -> Optional[dict]:
-    ref_list = parse_citations_xml(raw_xml)
-    if not ref_list:
+def transform_grobid(raw_xml: str) -> Optional[GrobidBiblio]:
+    ref_list: List[GrobidBiblio] = parse_citations_xml(raw_xml)
+    # check for unmatched or empty references
+    if not ref_list or not ref_list[0].to_dict():
         return None
-    ref = ref_list[0]
-    if not any(ref.values()):
-        return None
-    return ref
+    return ref_list[0]
 
 
-def ref_to_release(ref: dict) -> ReleaseEntity:
+def ref_to_release(ref: GrobidBiblio) -> ReleaseEntity:
     contribs = []
-    for author in ref.get("authors") or []:
+    for author in ref.authors or []:
         contribs.append(
             ReleaseContrib(
-                raw_name=author.get("name"),
-                given_name=author.get("given_name"),
-                surname=author.get("surname"),
+                raw_name=author.full_name,
+                given_name=author.given_name,
+                surname=author.surname,
             )
         )
     release = ReleaseEntity(
-        title=ref.get("title"),
+        title=ref.title,
         contribs=contribs,
-        volume=ref.get("volume"),
-        issue=ref.get("issue"),
-        pages=ref.get("pages"),
+        volume=ref.volume,
+        issue=ref.issue,
+        pages=ref.pages,
         ext_ids=ReleaseExtIds(
-            doi=ref.get("doi"),
-            pmid=ref.get("pmid"),
-            pmcid=ref.get("pmcid"),
-            arxiv=ref.get("arxiv_id"),
+            doi=ref.doi,
+            pmid=ref.pmid,
+            pmcid=ref.pmcid,
+            arxiv=ref.arxiv_id,
         ),
     )
-    if ref.get("journal"):
-        release.extra = {"container_name": ref.get("journal")}
-    if ref.get("date"):
-        if len(ref["date"]) == 4 and ref["date"].isdigit():
-            release.release_year = int(ref["date"])
+    if ref.journal:
+        release.extra = {"container_name": ref.journal}
+    if ref.date:
+        if len(ref.date) == 4 and ref.date.isdigit():
+            release.release_year = int(ref.date)
     return release
 
 
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index 6c9307d..b170f12 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -13,6 +13,7 @@ from typing import Any, Dict, List, Optional
 import ftfy
 from bs4 import BeautifulSoup
 from fatcat_openapi_client import ReleaseContrib, ReleaseEntity
+from grobid_tei_xml import GrobidDocument
 
 # pytype: disable=import-error
 from pydantic import BaseModel
@@ -494,12 +495,12 @@ def contrib_affiliation(contrib: ReleaseContrib) -> Optional[str]:
     return None
 
 
-def es_abstracts_from_grobid(tei_dict: dict) -> List[ScholarAbstract]:
+def es_abstracts_from_grobid(tei_doc: GrobidDocument) -> List[ScholarAbstract]:
 
-    if tei_dict.get("abstract"):
-        body = scrub_text(tei_dict["abstract"])
+    if tei_doc.abstract:
+        body = scrub_text(tei_doc.abstract)
         if body:
-            return [ScholarAbstract(lang_code=tei_dict.get("lang"), body=body)]
+            return [ScholarAbstract(lang_code=tei_doc.language_code, body=body)]
     return []
 
 
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index caeff21..d40e123 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -8,7 +8,7 @@ from typing import Any, Dict, List, Optional, Sequence
 
 import sentry_sdk
 from fatcat_openapi_client import FileEntity, ReleaseEntity, WebcaptureEntity
-from grobid_tei_xml import parse_document_xml
+from grobid_tei_xml import GrobidBiblio, GrobidDocument, parse_document_xml
 
 from fatcat_scholar.config import GIT_REVISION, settings
 from fatcat_scholar.identifiers import clean_doi, clean_pmcid
@@ -241,18 +241,18 @@ def _add_file_release_meta(
 
 
 def es_fulltext_from_grobid(
-    tei_dict: dict, pdf_meta: Optional[dict], re: ReleaseEntity, fe: FileEntity
+    tei_doc: GrobidDocument, pdf_meta: Optional[dict], re: ReleaseEntity, fe: FileEntity
 ) -> Optional[ScholarFulltext]:
-    if not tei_dict.get("body"):
+    if not tei_doc.body:
         return None
-    body = tei_dict.get("body")
+    body = tei_doc.body
     if body and len(body) > MAX_BODY_CHARS:
         body = body[:MAX_BODY_CHARS]
     ret = ScholarFulltext(
-        lang_code=tei_dict.get("lang"),
+        lang_code=tei_doc.language_code,
         body=body,
-        acknowledgement=tei_dict.get("acknowledgement"),
-        annex=tei_dict.get("annex"),
+        acknowledgement=tei_doc.acknowledgement,
+        annex=tei_doc.annex,
     )
     return _add_file_release_meta(ret, pdf_meta, re, fe)
 
@@ -521,15 +521,16 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
             if f.ident == heavy.grobid_fulltext["file_ident"]
         ][0]
         try:
-            tei_doc = parse_document_xml(heavy.grobid_fulltext["tei_xml"])
-            tei_dict = tei_doc.to_legacy_dict()
+            tei_doc: Optional[GrobidDocument] = parse_document_xml(
+                heavy.grobid_fulltext["tei_xml"]
+            )
         except xml.etree.ElementTree.ParseError:
-            tei_dict = None
-        if tei_dict:
+            tei_doc = None
+        if tei_doc:
             if not abstracts:
-                abstracts = es_abstracts_from_grobid(tei_dict)
+                abstracts = es_abstracts_from_grobid(tei_doc)
             grobid_fulltext = es_fulltext_from_grobid(
-                tei_dict, heavy.pdf_meta, fulltext_release, fulltext_file
+                tei_doc, heavy.pdf_meta, fulltext_release, fulltext_file
             )
             if exclude_web_fulltext and grobid_fulltext:
                 if not fulltext:
@@ -681,52 +682,50 @@ def test_clean_ref_key() -> None:
         assert clean_ref_key(raw, doi=doi) == expected
 
 
-def refs_from_grobid(release: ReleaseEntity, tei_dict: dict) -> List[RefStructured]:
+def refs_from_grobid(
+    release: ReleaseEntity, tei_doc: GrobidDocument
+) -> List[RefStructured]:
     output = []
-    for ref in tei_dict.get("citations") or []:
-        ref_date = ref.get("date") or None
+    ref: GrobidBiblio
+    for ref in tei_doc.citations or []:
+        ref_date = ref.date or None
         ref_year: Optional[int] = None
         if ref_date and len(ref_date) >= 4 and ref_date[:4].isdigit():
             ref_year = int(ref_date[:4])
-        ref_authors = ref.get("authors") or []
         authors: List[str] = []
-        for a in ref_authors:
-            if isinstance(a, str):
-                authors.append(a)
-            elif isinstance(a, dict):
-                if a.get("name"):
-                    assert isinstance(a["name"], str)
-                    authors.append(a["name"])
-        ref_index = ref.get("index")
+        for a in ref.authors or []:
+            if a.full_name:
+                assert isinstance(a.full_name, str)
+                authors.append(a.full_name)
+        ref_index = ref.index
         if ref_index is not None:
             # transform from 0-indexed to 1-indexed
             ref_index = ref_index + 1
         output.append(
             RefStructured(
                 biblio=RefBiblio(
-                    unstructured=ref.get("unstructured"),
-                    title=ref.get("title"),
+                    unstructured=ref.unstructured,
+                    title=ref.title,
                     # subtitle
                     contrib_raw_names=authors or None,
                     year=ref_year,
-                    container_name=ref.get("journal"),
-                    publisher=ref.get("publisher"),
-                    volume=ref.get("volume"),
-                    issue=ref.get("issue"),
-                    pages=ref.get("pages"),
-                    doi=clean_doi(ref.get("doi")),
-                    pmid=ref.get("pmid"),
-                    pmcid=clean_pmcid(ref.get("pmcid")),
-                    arxiv_id=ref.get("arxiv_id"),
-                    isbn=ref.get("isbn"),
-                    url=clean_url_conservative(ref.get("url")),
+                    container_name=ref.journal,
+                    publisher=ref.publisher,
+                    volume=ref.volume,
+                    issue=ref.issue,
+                    pages=ref.pages,
+                    doi=clean_doi(ref.doi),
+                    pmid=ref.pmid,
+                    pmcid=clean_pmcid(ref.pmcid),
+                    arxiv_id=ref.arxiv_id,
+                    url=clean_url_conservative(ref.url),
                 ),
                 release_ident=release.ident,
                 work_ident=release.work_id,
                 release_stage=release.release_stage,
                 release_year=release.release_year,
                 index=ref_index,
-                key=clean_ref_key(ref.get("id")),
+                key=clean_ref_key(ref.id),
                 locator=None,
                 # target_release_id
                 ref_source="grobid",
@@ -902,8 +901,7 @@ def refs_from_heavy(heavy: IntermediateBundle) -> Sequence[RefStructured]:
             if r.ident == heavy.grobid_fulltext["release_ident"]
         ][0]
         tei_doc = parse_document_xml(heavy.grobid_fulltext["tei_xml"])
-        tei_dict = tei_doc.to_legacy_dict()
-        fulltext_refs = refs_from_grobid(fulltext_release, tei_dict)
+        fulltext_refs = refs_from_grobid(fulltext_release, tei_doc)
 
     crossref_refs: List[RefStructured] = []
     if heavy.crossref:
diff --git a/tests/test_grobid2json.py b/tests/test_grobid_parse.py
index adf36a1..c0adf9b 100644
--- a/tests/test_grobid2json.py
+++ b/tests/test_grobid_parse.py
@@ -1,7 +1,7 @@
 from grobid_tei_xml import parse_document_xml
 
 
-def test_grobid_parse() -> None:
+def test_grobid_parse_legacy() -> None:
     """
     This function formerly tested the grobid2json file in this project. Now it
     tests backwards-compatibility of the grobid_tei_xml library.
@@ -29,3 +29,34 @@ def test_grobid_parse() -> None:
         ref["unstructured"]
         == "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19."
     )
+
+
+def test_grobid_parse() -> None:
+    """
+    Equivalent to test_grobid_parse_legacy(), but using the GrobidDocument type directly
+    """
+
+    with open("tests/files/example_grobid.tei.xml", "r") as f:
+        blob = f.read()
+
+    doc = parse_document_xml(blob)
+
+    assert (
+        doc.header.title
+        == "Changes of patients' satisfaction with the health care services in Lithuanian Health Promoting Hospitals network"
+    )
+
+    assert doc.citations is not None
+    ref = [c for c in doc.citations if c.id == "b12"][0]
+    assert ref.authors[0].given_name == "K"
+    assert ref.authors[0].full_name == "K Tasa"
+    assert ref.authors[0].surname == "Tasa"
+    assert ref.journal == "Quality Management in Health Care"
+    assert ref.title == "Using patient feedback for quality improvement"
+    assert ref.date == "1996"
+    assert ref.pages == "206-225"
+    assert ref.volume == "8"
+    assert (
+        ref.unstructured
+        == "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19."
+    )
diff --git a/tests/test_refs_transform.py b/tests/test_refs_transform.py
index 2fc210f..4b1b588 100644
--- a/tests/test_refs_transform.py
+++ b/tests/test_refs_transform.py
@@ -20,8 +20,7 @@ def test_transform_refs_grobid() -> None:
     )
 
     tei_doc = parse_document_xml(blob)
-    tei_dict = tei_doc.to_legacy_dict()
-    refs = refs_from_grobid(dummy_release, tei_dict)
+    refs = refs_from_grobid(dummy_release, tei_doc)
 
     ref = refs[12]
     assert ref.release_ident == "releasedummy22222222222222"
author	Bryan Newbold <bnewbold@archive.org>	2021-10-27 18:24:19 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2021-10-27 18:25:58 -0700
commit	560d5f7cc1672f95e2a953ab5908f4205151a703 (patch)
tree	04b35084358786bbd2329491be07cde35a4d2289
parent	33211915773a0c77d064c55c1b02ceed6f455feb (diff)
download	fatcat-scholar-560d5f7cc1672f95e2a953ab5908f4205151a703.tar.gz fatcat-scholar-560d5f7cc1672f95e2a953ab5908f4205151a703.zip