diff options
| -rw-r--r-- | fatcat_scholar/query_citation.py | 52 | ||||
| -rw-r--r-- | fatcat_scholar/schema.py | 9 | ||||
| -rw-r--r-- | fatcat_scholar/transform.py | 80 | ||||
| -rw-r--r-- | tests/test_grobid_parse.py (renamed from tests/test_grobid2json.py) | 33 | ||||
| -rw-r--r-- | tests/test_refs_transform.py | 3 | 
5 files changed, 102 insertions, 75 deletions
| diff --git a/fatcat_scholar/query_citation.py b/fatcat_scholar/query_citation.py index dea4f02..0233acc 100644 --- a/fatcat_scholar/query_citation.py +++ b/fatcat_scholar/query_citation.py @@ -11,14 +11,14 @@ parallel with "regular" query?  """  import sys -from typing import Any, Optional, Tuple +from typing import Any, List, Optional, Tuple  import fuzzycat.common  import fuzzycat.verify  import requests  from fatcat_openapi_client import ReleaseContrib, ReleaseEntity, ReleaseExtIds  from fuzzycat.matching import match_release_fuzzy -from grobid_tei_xml import parse_citations_xml +from grobid_tei_xml import GrobidBiblio, parse_citations_xml  from fatcat_scholar.api_entities import entity_to_dict @@ -44,44 +44,42 @@ def grobid_process_citation(      return grobid_response.text -def transform_grobid(raw_xml: str) -> Optional[dict]: -    ref_list = parse_citations_xml(raw_xml) -    if not ref_list: +def transform_grobid(raw_xml: str) -> Optional[GrobidBiblio]: +    ref_list: List[GrobidBiblio] = parse_citations_xml(raw_xml) +    # check for unmatched or empty references +    if not ref_list or not ref_list[0].to_dict():          return None -    ref = ref_list[0] -    if not any(ref.values()): -        return None -    return ref +    return ref_list[0] -def ref_to_release(ref: dict) -> ReleaseEntity: +def ref_to_release(ref: GrobidBiblio) -> ReleaseEntity:      contribs = [] -    for author in ref.get("authors") or []: +    for author in ref.authors or []:          contribs.append(              ReleaseContrib( -                raw_name=author.get("name"), -                given_name=author.get("given_name"), -                surname=author.get("surname"), +                raw_name=author.full_name, +                given_name=author.given_name, +                surname=author.surname,              )          )      release = ReleaseEntity( -        title=ref.get("title"), +        title=ref.title,          contribs=contribs, -        volume=ref.get("volume"), -        issue=ref.get("issue"), -        pages=ref.get("pages"), +        volume=ref.volume, +        issue=ref.issue, +        pages=ref.pages,          ext_ids=ReleaseExtIds( -            doi=ref.get("doi"), -            pmid=ref.get("pmid"), -            pmcid=ref.get("pmcid"), -            arxiv=ref.get("arxiv_id"), +            doi=ref.doi, +            pmid=ref.pmid, +            pmcid=ref.pmcid, +            arxiv=ref.arxiv_id,          ),      ) -    if ref.get("journal"): -        release.extra = {"container_name": ref.get("journal")} -    if ref.get("date"): -        if len(ref["date"]) == 4 and ref["date"].isdigit(): -            release.release_year = int(ref["date"]) +    if ref.journal: +        release.extra = {"container_name": ref.journal} +    if ref.date: +        if len(ref.date) == 4 and ref.date.isdigit(): +            release.release_year = int(ref.date)      return release diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py index 6c9307d..b170f12 100644 --- a/fatcat_scholar/schema.py +++ b/fatcat_scholar/schema.py @@ -13,6 +13,7 @@ from typing import Any, Dict, List, Optional  import ftfy  from bs4 import BeautifulSoup  from fatcat_openapi_client import ReleaseContrib, ReleaseEntity +from grobid_tei_xml import GrobidDocument  # pytype: disable=import-error  from pydantic import BaseModel @@ -494,12 +495,12 @@ def contrib_affiliation(contrib: ReleaseContrib) -> Optional[str]:      return None -def es_abstracts_from_grobid(tei_dict: dict) -> List[ScholarAbstract]: +def es_abstracts_from_grobid(tei_doc: GrobidDocument) -> List[ScholarAbstract]: -    if tei_dict.get("abstract"): -        body = scrub_text(tei_dict["abstract"]) +    if tei_doc.abstract: +        body = scrub_text(tei_doc.abstract)          if body: -            return [ScholarAbstract(lang_code=tei_dict.get("lang"), body=body)] +            return [ScholarAbstract(lang_code=tei_doc.language_code, body=body)]      return [] diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py index caeff21..d40e123 100644 --- a/fatcat_scholar/transform.py +++ b/fatcat_scholar/transform.py @@ -8,7 +8,7 @@ from typing import Any, Dict, List, Optional, Sequence  import sentry_sdk  from fatcat_openapi_client import FileEntity, ReleaseEntity, WebcaptureEntity -from grobid_tei_xml import parse_document_xml +from grobid_tei_xml import GrobidBiblio, GrobidDocument, parse_document_xml  from fatcat_scholar.config import GIT_REVISION, settings  from fatcat_scholar.identifiers import clean_doi, clean_pmcid @@ -241,18 +241,18 @@ def _add_file_release_meta(  def es_fulltext_from_grobid( -    tei_dict: dict, pdf_meta: Optional[dict], re: ReleaseEntity, fe: FileEntity +    tei_doc: GrobidDocument, pdf_meta: Optional[dict], re: ReleaseEntity, fe: FileEntity  ) -> Optional[ScholarFulltext]: -    if not tei_dict.get("body"): +    if not tei_doc.body:          return None -    body = tei_dict.get("body") +    body = tei_doc.body      if body and len(body) > MAX_BODY_CHARS:          body = body[:MAX_BODY_CHARS]      ret = ScholarFulltext( -        lang_code=tei_dict.get("lang"), +        lang_code=tei_doc.language_code,          body=body, -        acknowledgement=tei_dict.get("acknowledgement"), -        annex=tei_dict.get("annex"), +        acknowledgement=tei_doc.acknowledgement, +        annex=tei_doc.annex,      )      return _add_file_release_meta(ret, pdf_meta, re, fe) @@ -521,15 +521,16 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:              if f.ident == heavy.grobid_fulltext["file_ident"]          ][0]          try: -            tei_doc = parse_document_xml(heavy.grobid_fulltext["tei_xml"]) -            tei_dict = tei_doc.to_legacy_dict() +            tei_doc: Optional[GrobidDocument] = parse_document_xml( +                heavy.grobid_fulltext["tei_xml"] +            )          except xml.etree.ElementTree.ParseError: -            tei_dict = None -        if tei_dict: +            tei_doc = None +        if tei_doc:              if not abstracts: -                abstracts = es_abstracts_from_grobid(tei_dict) +                abstracts = es_abstracts_from_grobid(tei_doc)              grobid_fulltext = es_fulltext_from_grobid( -                tei_dict, heavy.pdf_meta, fulltext_release, fulltext_file +                tei_doc, heavy.pdf_meta, fulltext_release, fulltext_file              )              if exclude_web_fulltext and grobid_fulltext:                  if not fulltext: @@ -681,52 +682,50 @@ def test_clean_ref_key() -> None:          assert clean_ref_key(raw, doi=doi) == expected -def refs_from_grobid(release: ReleaseEntity, tei_dict: dict) -> List[RefStructured]: +def refs_from_grobid( +    release: ReleaseEntity, tei_doc: GrobidDocument +) -> List[RefStructured]:      output = [] -    for ref in tei_dict.get("citations") or []: -        ref_date = ref.get("date") or None +    ref: GrobidBiblio +    for ref in tei_doc.citations or []: +        ref_date = ref.date or None          ref_year: Optional[int] = None          if ref_date and len(ref_date) >= 4 and ref_date[:4].isdigit():              ref_year = int(ref_date[:4]) -        ref_authors = ref.get("authors") or []          authors: List[str] = [] -        for a in ref_authors: -            if isinstance(a, str): -                authors.append(a) -            elif isinstance(a, dict): -                if a.get("name"): -                    assert isinstance(a["name"], str) -                    authors.append(a["name"]) -        ref_index = ref.get("index") +        for a in ref.authors or []: +            if a.full_name: +                assert isinstance(a.full_name, str) +                authors.append(a.full_name) +        ref_index = ref.index          if ref_index is not None:              # transform from 0-indexed to 1-indexed              ref_index = ref_index + 1          output.append(              RefStructured(                  biblio=RefBiblio( -                    unstructured=ref.get("unstructured"), -                    title=ref.get("title"), +                    unstructured=ref.unstructured, +                    title=ref.title,                      # subtitle                      contrib_raw_names=authors or None,                      year=ref_year, -                    container_name=ref.get("journal"), -                    publisher=ref.get("publisher"), -                    volume=ref.get("volume"), -                    issue=ref.get("issue"), -                    pages=ref.get("pages"), -                    doi=clean_doi(ref.get("doi")), -                    pmid=ref.get("pmid"), -                    pmcid=clean_pmcid(ref.get("pmcid")), -                    arxiv_id=ref.get("arxiv_id"), -                    isbn=ref.get("isbn"), -                    url=clean_url_conservative(ref.get("url")), +                    container_name=ref.journal, +                    publisher=ref.publisher, +                    volume=ref.volume, +                    issue=ref.issue, +                    pages=ref.pages, +                    doi=clean_doi(ref.doi), +                    pmid=ref.pmid, +                    pmcid=clean_pmcid(ref.pmcid), +                    arxiv_id=ref.arxiv_id, +                    url=clean_url_conservative(ref.url),                  ),                  release_ident=release.ident,                  work_ident=release.work_id,                  release_stage=release.release_stage,                  release_year=release.release_year,                  index=ref_index, -                key=clean_ref_key(ref.get("id")), +                key=clean_ref_key(ref.id),                  locator=None,                  # target_release_id                  ref_source="grobid", @@ -902,8 +901,7 @@ def refs_from_heavy(heavy: IntermediateBundle) -> Sequence[RefStructured]:              if r.ident == heavy.grobid_fulltext["release_ident"]          ][0]          tei_doc = parse_document_xml(heavy.grobid_fulltext["tei_xml"]) -        tei_dict = tei_doc.to_legacy_dict() -        fulltext_refs = refs_from_grobid(fulltext_release, tei_dict) +        fulltext_refs = refs_from_grobid(fulltext_release, tei_doc)      crossref_refs: List[RefStructured] = []      if heavy.crossref: diff --git a/tests/test_grobid2json.py b/tests/test_grobid_parse.py index adf36a1..c0adf9b 100644 --- a/tests/test_grobid2json.py +++ b/tests/test_grobid_parse.py @@ -1,7 +1,7 @@  from grobid_tei_xml import parse_document_xml -def test_grobid_parse() -> None: +def test_grobid_parse_legacy() -> None:      """      This function formerly tested the grobid2json file in this project. Now it      tests backwards-compatibility of the grobid_tei_xml library. @@ -29,3 +29,34 @@ def test_grobid_parse() -> None:          ref["unstructured"]          == "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19."      ) + + +def test_grobid_parse() -> None: +    """ +    Equivalent to test_grobid_parse_legacy(), but using the GrobidDocument type directly +    """ + +    with open("tests/files/example_grobid.tei.xml", "r") as f: +        blob = f.read() + +    doc = parse_document_xml(blob) + +    assert ( +        doc.header.title +        == "Changes of patients' satisfaction with the health care services in Lithuanian Health Promoting Hospitals network" +    ) + +    assert doc.citations is not None +    ref = [c for c in doc.citations if c.id == "b12"][0] +    assert ref.authors[0].given_name == "K" +    assert ref.authors[0].full_name == "K Tasa" +    assert ref.authors[0].surname == "Tasa" +    assert ref.journal == "Quality Management in Health Care" +    assert ref.title == "Using patient feedback for quality improvement" +    assert ref.date == "1996" +    assert ref.pages == "206-225" +    assert ref.volume == "8" +    assert ( +        ref.unstructured +        == "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19." +    ) diff --git a/tests/test_refs_transform.py b/tests/test_refs_transform.py index 2fc210f..4b1b588 100644 --- a/tests/test_refs_transform.py +++ b/tests/test_refs_transform.py @@ -20,8 +20,7 @@ def test_transform_refs_grobid() -> None:      )      tei_doc = parse_document_xml(blob) -    tei_dict = tei_doc.to_legacy_dict() -    refs = refs_from_grobid(dummy_release, tei_dict) +    refs = refs_from_grobid(dummy_release, tei_doc)      ref = refs[12]      assert ref.release_ident == "releasedummy22222222222222" | 
