diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-27 15:33:29 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-27 18:25:58 -0700 |
commit | 33211915773a0c77d064c55c1b02ceed6f455feb (patch) | |
tree | 1828505db917686e7223d41e97c6446223f2da32 /fatcat_scholar/transform.py | |
parent | 6c103e4dc48e7e0c0f6cdedc18b0afe33babf1ac (diff) | |
download | fatcat-scholar-33211915773a0c77d064c55c1b02ceed6f455feb.tar.gz fatcat-scholar-33211915773a0c77d064c55c1b02ceed6f455feb.zip |
replace grobid2json with grobid_tei_xml
This first iteration uses the .to_legacy_dict() helpers for backwards
compatibility.
Diffstat (limited to 'fatcat_scholar/transform.py')
-rw-r--r-- | fatcat_scholar/transform.py | 8 |
1 files changed, 5 insertions, 3 deletions
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py index f805e7b..caeff21 100644 --- a/fatcat_scholar/transform.py +++ b/fatcat_scholar/transform.py @@ -8,9 +8,9 @@ from typing import Any, Dict, List, Optional, Sequence import sentry_sdk from fatcat_openapi_client import FileEntity, ReleaseEntity, WebcaptureEntity +from grobid_tei_xml import parse_document_xml from fatcat_scholar.config import GIT_REVISION, settings -from fatcat_scholar.grobid2json import teixml2json from fatcat_scholar.identifiers import clean_doi, clean_pmcid from fatcat_scholar.schema import ( AccessType, @@ -521,7 +521,8 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]: if f.ident == heavy.grobid_fulltext["file_ident"] ][0] try: - tei_dict: Optional[dict] = teixml2json(heavy.grobid_fulltext["tei_xml"]) + tei_doc = parse_document_xml(heavy.grobid_fulltext["tei_xml"]) + tei_dict = tei_doc.to_legacy_dict() except xml.etree.ElementTree.ParseError: tei_dict = None if tei_dict: @@ -900,7 +901,8 @@ def refs_from_heavy(heavy: IntermediateBundle) -> Sequence[RefStructured]: for r in heavy.releases if r.ident == heavy.grobid_fulltext["release_ident"] ][0] - tei_dict = teixml2json(heavy.grobid_fulltext["tei_xml"]) + tei_doc = parse_document_xml(heavy.grobid_fulltext["tei_xml"]) + tei_dict = tei_doc.to_legacy_dict() fulltext_refs = refs_from_grobid(fulltext_release, tei_dict) crossref_refs: List[RefStructured] = [] |