about | summary | refs | log | tree | commit | diff | stats
path: root/fatcat_scholar/transform.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2021-10-27 15:33:29 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-10-27 18:25:58 -0700
commit: 33211915773a0c77d064c55c1b02ceed6f455feb (patch)
tree: 1828505db917686e7223d41e97c6446223f2da32 /fatcat_scholar/transform.py
parent: 6c103e4dc48e7e0c0f6cdedc18b0afe33babf1ac (diff)
download: fatcat-scholar-33211915773a0c77d064c55c1b02ceed6f455feb.tar.gz
download: fatcat-scholar-33211915773a0c77d064c55c1b02ceed6f455feb.zip
replace grobid2json with grobid_tei_xml
This first iteration uses the .to_legacy_dict() helpers for backwards compatibility
Diffstat (limited to 'fatcat_scholar/transform.py')
-rw-r--r-- fatcat_scholar/transform.py 8
1 files changed, 5 insertions, 3 deletions
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index f805e7b..caeff21 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -8,9 +8,9 @@ from typing import Any, Dict, List, Optional, Sequence
import sentry_sdk
from fatcat_openapi_client import FileEntity, ReleaseEntity, WebcaptureEntity
+from grobid_tei_xml import parse_document_xml
from fatcat_scholar.config import GIT_REVISION, settings
-from fatcat_scholar.grobid2json import teixml2json
from fatcat_scholar.identifiers import clean_doi, clean_pmcid
from fatcat_scholar.schema import (
AccessType,
@@ -521,7 +521,8 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
if f.ident == heavy.grobid_fulltext["file_ident"]
][0]
try:
- tei_dict: Optional[dict] = teixml2json(heavy.grobid_fulltext["tei_xml"])
+ tei_doc = parse_document_xml(heavy.grobid_fulltext["tei_xml"])
+ tei_dict = tei_doc.to_legacy_dict()
except xml.etree.ElementTree.ParseError:
tei_dict = None
if tei_dict:
@@ -900,7 +901,8 @@ def refs_from_heavy(heavy: IntermediateBundle) -> Sequence[RefStructured]:
for r in heavy.releases
if r.ident == heavy.grobid_fulltext["release_ident"]
][0]
- tei_dict = teixml2json(heavy.grobid_fulltext["tei_xml"])
+ tei_doc = parse_document_xml(heavy.grobid_fulltext["tei_xml"])
+ tei_dict = tei_doc.to_legacy_dict()
fulltext_refs = refs_from_grobid(fulltext_release, tei_dict)
crossref_refs: List[RefStructured] = []