diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-27 18:24:19 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-27 18:25:58 -0700 |
commit | 560d5f7cc1672f95e2a953ab5908f4205151a703 (patch) | |
tree | 04b35084358786bbd2329491be07cde35a4d2289 | |
parent | 33211915773a0c77d064c55c1b02ceed6f455feb (diff) | |
download | fatcat-scholar-560d5f7cc1672f95e2a953ab5908f4205151a703.tar.gz fatcat-scholar-560d5f7cc1672f95e2a953ab5908f4205151a703.zip |
refactor use of grobid_tei_xml
-rw-r--r-- | fatcat_scholar/query_citation.py | 52 | ||||
-rw-r--r-- | fatcat_scholar/schema.py | 9 | ||||
-rw-r--r-- | fatcat_scholar/transform.py | 80 | ||||
-rw-r--r-- | tests/test_grobid_parse.py (renamed from tests/test_grobid2json.py) | 33 | ||||
-rw-r--r-- | tests/test_refs_transform.py | 3 |
5 files changed, 102 insertions, 75 deletions
diff --git a/fatcat_scholar/query_citation.py b/fatcat_scholar/query_citation.py index dea4f02..0233acc 100644 --- a/fatcat_scholar/query_citation.py +++ b/fatcat_scholar/query_citation.py @@ -11,14 +11,14 @@ parallel with "regular" query? """ import sys -from typing import Any, Optional, Tuple +from typing import Any, List, Optional, Tuple import fuzzycat.common import fuzzycat.verify import requests from fatcat_openapi_client import ReleaseContrib, ReleaseEntity, ReleaseExtIds from fuzzycat.matching import match_release_fuzzy -from grobid_tei_xml import parse_citations_xml +from grobid_tei_xml import GrobidBiblio, parse_citations_xml from fatcat_scholar.api_entities import entity_to_dict @@ -44,44 +44,42 @@ def grobid_process_citation( return grobid_response.text -def transform_grobid(raw_xml: str) -> Optional[dict]: - ref_list = parse_citations_xml(raw_xml) - if not ref_list: +def transform_grobid(raw_xml: str) -> Optional[GrobidBiblio]: + ref_list: List[GrobidBiblio] = parse_citations_xml(raw_xml) + # check for unmatched or empty references + if not ref_list or not ref_list[0].to_dict(): return None - ref = ref_list[0] - if not any(ref.values()): - return None - return ref + return ref_list[0] -def ref_to_release(ref: dict) -> ReleaseEntity: +def ref_to_release(ref: GrobidBiblio) -> ReleaseEntity: contribs = [] - for author in ref.get("authors") or []: + for author in ref.authors or []: contribs.append( ReleaseContrib( - raw_name=author.get("name"), - given_name=author.get("given_name"), - surname=author.get("surname"), + raw_name=author.full_name, + given_name=author.given_name, + surname=author.surname, ) ) release = ReleaseEntity( - title=ref.get("title"), + title=ref.title, contribs=contribs, - volume=ref.get("volume"), - issue=ref.get("issue"), - pages=ref.get("pages"), + volume=ref.volume, + issue=ref.issue, + pages=ref.pages, ext_ids=ReleaseExtIds( - doi=ref.get("doi"), - pmid=ref.get("pmid"), - pmcid=ref.get("pmcid"), - arxiv=ref.get("arxiv_id"), + doi=ref.doi, + pmid=ref.pmid, + pmcid=ref.pmcid, + arxiv=ref.arxiv_id, ), ) - if ref.get("journal"): - release.extra = {"container_name": ref.get("journal")} - if ref.get("date"): - if len(ref["date"]) == 4 and ref["date"].isdigit(): - release.release_year = int(ref["date"]) + if ref.journal: + release.extra = {"container_name": ref.journal} + if ref.date: + if len(ref.date) == 4 and ref.date.isdigit(): + release.release_year = int(ref.date) return release diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py index 6c9307d..b170f12 100644 --- a/fatcat_scholar/schema.py +++ b/fatcat_scholar/schema.py @@ -13,6 +13,7 @@ from typing import Any, Dict, List, Optional import ftfy from bs4 import BeautifulSoup from fatcat_openapi_client import ReleaseContrib, ReleaseEntity +from grobid_tei_xml import GrobidDocument # pytype: disable=import-error from pydantic import BaseModel @@ -494,12 +495,12 @@ def contrib_affiliation(contrib: ReleaseContrib) -> Optional[str]: return None -def es_abstracts_from_grobid(tei_dict: dict) -> List[ScholarAbstract]: +def es_abstracts_from_grobid(tei_doc: GrobidDocument) -> List[ScholarAbstract]: - if tei_dict.get("abstract"): - body = scrub_text(tei_dict["abstract"]) + if tei_doc.abstract: + body = scrub_text(tei_doc.abstract) if body: - return [ScholarAbstract(lang_code=tei_dict.get("lang"), body=body)] + return [ScholarAbstract(lang_code=tei_doc.language_code, body=body)] return [] diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py index caeff21..d40e123 100644 --- a/fatcat_scholar/transform.py +++ b/fatcat_scholar/transform.py @@ -8,7 +8,7 @@ from typing import Any, Dict, List, Optional, Sequence import sentry_sdk from fatcat_openapi_client import FileEntity, ReleaseEntity, WebcaptureEntity -from grobid_tei_xml import parse_document_xml +from grobid_tei_xml import GrobidBiblio, GrobidDocument, parse_document_xml from fatcat_scholar.config import GIT_REVISION, settings from fatcat_scholar.identifiers import clean_doi, clean_pmcid @@ -241,18 +241,18 @@ def _add_file_release_meta( def es_fulltext_from_grobid( - tei_dict: dict, pdf_meta: Optional[dict], re: ReleaseEntity, fe: FileEntity + tei_doc: GrobidDocument, pdf_meta: Optional[dict], re: ReleaseEntity, fe: FileEntity ) -> Optional[ScholarFulltext]: - if not tei_dict.get("body"): + if not tei_doc.body: return None - body = tei_dict.get("body") + body = tei_doc.body if body and len(body) > MAX_BODY_CHARS: body = body[:MAX_BODY_CHARS] ret = ScholarFulltext( - lang_code=tei_dict.get("lang"), + lang_code=tei_doc.language_code, body=body, - acknowledgement=tei_dict.get("acknowledgement"), - annex=tei_dict.get("annex"), + acknowledgement=tei_doc.acknowledgement, + annex=tei_doc.annex, ) return _add_file_release_meta(ret, pdf_meta, re, fe) @@ -521,15 +521,16 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]: if f.ident == heavy.grobid_fulltext["file_ident"] ][0] try: - tei_doc = parse_document_xml(heavy.grobid_fulltext["tei_xml"]) - tei_dict = tei_doc.to_legacy_dict() + tei_doc: Optional[GrobidDocument] = parse_document_xml( + heavy.grobid_fulltext["tei_xml"] + ) except xml.etree.ElementTree.ParseError: - tei_dict = None - if tei_dict: + tei_doc = None + if tei_doc: if not abstracts: - abstracts = es_abstracts_from_grobid(tei_dict) + abstracts = es_abstracts_from_grobid(tei_doc) grobid_fulltext = es_fulltext_from_grobid( - tei_dict, heavy.pdf_meta, fulltext_release, fulltext_file + tei_doc, heavy.pdf_meta, fulltext_release, fulltext_file ) if exclude_web_fulltext and grobid_fulltext: if not fulltext: @@ -681,52 +682,50 @@ def test_clean_ref_key() -> None: assert clean_ref_key(raw, doi=doi) == expected -def refs_from_grobid(release: ReleaseEntity, tei_dict: dict) -> List[RefStructured]: +def refs_from_grobid( + release: ReleaseEntity, tei_doc: GrobidDocument +) -> List[RefStructured]: output = [] - for ref in tei_dict.get("citations") or []: - ref_date = ref.get("date") or None + ref: GrobidBiblio + for ref in tei_doc.citations or []: + ref_date = ref.date or None ref_year: Optional[int] = None if ref_date and len(ref_date) >= 4 and ref_date[:4].isdigit(): ref_year = int(ref_date[:4]) - ref_authors = ref.get("authors") or [] authors: List[str] = [] - for a in ref_authors: - if isinstance(a, str): - authors.append(a) - elif isinstance(a, dict): - if a.get("name"): - assert isinstance(a["name"], str) - authors.append(a["name"]) - ref_index = ref.get("index") + for a in ref.authors or []: + if a.full_name: + assert isinstance(a.full_name, str) + authors.append(a.full_name) + ref_index = ref.index if ref_index is not None: # transform from 0-indexed to 1-indexed ref_index = ref_index + 1 output.append( RefStructured( biblio=RefBiblio( - unstructured=ref.get("unstructured"), - title=ref.get("title"), + unstructured=ref.unstructured, + title=ref.title, # subtitle contrib_raw_names=authors or None, year=ref_year, - container_name=ref.get("journal"), - publisher=ref.get("publisher"), - volume=ref.get("volume"), - issue=ref.get("issue"), - pages=ref.get("pages"), - doi=clean_doi(ref.get("doi")), - pmid=ref.get("pmid"), - pmcid=clean_pmcid(ref.get("pmcid")), - arxiv_id=ref.get("arxiv_id"), - isbn=ref.get("isbn"), - url=clean_url_conservative(ref.get("url")), + container_name=ref.journal, + publisher=ref.publisher, + volume=ref.volume, + issue=ref.issue, + pages=ref.pages, + doi=clean_doi(ref.doi), + pmid=ref.pmid, + pmcid=clean_pmcid(ref.pmcid), + arxiv_id=ref.arxiv_id, + url=clean_url_conservative(ref.url), ), release_ident=release.ident, work_ident=release.work_id, release_stage=release.release_stage, release_year=release.release_year, index=ref_index, - key=clean_ref_key(ref.get("id")), + key=clean_ref_key(ref.id), locator=None, # target_release_id ref_source="grobid", @@ -902,8 +901,7 @@ def refs_from_heavy(heavy: IntermediateBundle) -> Sequence[RefStructured]: if r.ident == heavy.grobid_fulltext["release_ident"] ][0] tei_doc = parse_document_xml(heavy.grobid_fulltext["tei_xml"]) - tei_dict = tei_doc.to_legacy_dict() - fulltext_refs = refs_from_grobid(fulltext_release, tei_dict) + fulltext_refs = refs_from_grobid(fulltext_release, tei_doc) crossref_refs: List[RefStructured] = [] if heavy.crossref: diff --git a/tests/test_grobid2json.py b/tests/test_grobid_parse.py index adf36a1..c0adf9b 100644 --- a/tests/test_grobid2json.py +++ b/tests/test_grobid_parse.py @@ -1,7 +1,7 @@ from grobid_tei_xml import parse_document_xml -def test_grobid_parse() -> None: +def test_grobid_parse_legacy() -> None: """ This function formerly tested the grobid2json file in this project. Now it tests backwards-compatibility of the grobid_tei_xml library. @@ -29,3 +29,34 @@ def test_grobid_parse() -> None: ref["unstructured"] == "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19." ) + + +def test_grobid_parse() -> None: + """ + Equivalent to test_grobid_parse_legacy(), but using the GrobidDocument type directly + """ + + with open("tests/files/example_grobid.tei.xml", "r") as f: + blob = f.read() + + doc = parse_document_xml(blob) + + assert ( + doc.header.title + == "Changes of patients' satisfaction with the health care services in Lithuanian Health Promoting Hospitals network" + ) + + assert doc.citations is not None + ref = [c for c in doc.citations if c.id == "b12"][0] + assert ref.authors[0].given_name == "K" + assert ref.authors[0].full_name == "K Tasa" + assert ref.authors[0].surname == "Tasa" + assert ref.journal == "Quality Management in Health Care" + assert ref.title == "Using patient feedback for quality improvement" + assert ref.date == "1996" + assert ref.pages == "206-225" + assert ref.volume == "8" + assert ( + ref.unstructured + == "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19." + ) diff --git a/tests/test_refs_transform.py b/tests/test_refs_transform.py index 2fc210f..4b1b588 100644 --- a/tests/test_refs_transform.py +++ b/tests/test_refs_transform.py @@ -20,8 +20,7 @@ def test_transform_refs_grobid() -> None: ) tei_doc = parse_document_xml(blob) - tei_dict = tei_doc.to_legacy_dict() - refs = refs_from_grobid(dummy_release, tei_dict) + refs = refs_from_grobid(dummy_release, tei_doc) ref = refs[12] assert ref.release_ident == "releasedummy22222222222222" |