diff options
Diffstat (limited to 'fatcat_scholar/transform.py')
-rw-r--r-- | fatcat_scholar/transform.py | 51 |
1 file changed, 33 insertions, 18 deletions
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index 4114885..cbf0a89 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -1,14 +1,16 @@
 import sys
 import argparse
 import datetime
+import xml.etree.ElementTree
 import xml.etree.ElementTree as ET
 from typing import List, Dict, Optional, Any, Sequence
 
+import sentry_sdk
 from fatcat_openapi_client import ReleaseEntity, FileEntity, WebcaptureEntity
 
 from fatcat_scholar.api_entities import *
 from fatcat_scholar.schema import *
-from fatcat_scholar.config import settings
+from fatcat_scholar.config import settings, GIT_REVISION
 from fatcat_scholar.grobid2json import teixml2json
 
 
@@ -138,11 +140,10 @@ def es_biblio_from_sim(sim: Dict[str, Any]) -> ScholarBiblio:
     if release_year and abs(release_year) > 2050:
         release_year = None
 
-    lang_code = SIM_LANG_MAP.get(issue_meta.get("language")) or SIM_LANG_MAP.get(
-        pub_meta.get("language")
-    )
-    if isinstance(lang_code, list):
-        lang_code = lang_code[0]
+    language = issue_meta.get("language") or pub_meta.get("language")
+    if isinstance(language, list):
+        language = language[0]
+    lang_code = SIM_LANG_MAP.get(language)
 
     return ScholarBiblio(
         # release_ident=release.ident,
@@ -479,18 +480,22 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
             for f in fulltext_release.files
             if f.ident == heavy.grobid_fulltext["file_ident"]
         ][0]
-        tei_dict = teixml2json(heavy.grobid_fulltext["tei_xml"])
-        if not abstracts:
-            abstracts = es_abstracts_from_grobid(tei_dict)
-        grobid_fulltext = es_fulltext_from_grobid(
-            tei_dict, heavy.pdf_meta, fulltext_release, fulltext_file
-        )
-        if exclude_web_fulltext and grobid_fulltext:
-            if not fulltext:
-                # include only partial fulltext object, with no access
-                fulltext = grobid_fulltext.remove_access()
-        else:
-            fulltext = grobid_fulltext
+        try:
+            tei_dict: Optional[dict] = teixml2json(heavy.grobid_fulltext["tei_xml"])
+        except xml.etree.ElementTree.ParseError:
+            tei_dict = None
+        if tei_dict:
+            if not abstracts:
+                abstracts = es_abstracts_from_grobid(tei_dict)
+            grobid_fulltext = es_fulltext_from_grobid(
+                tei_dict, heavy.pdf_meta, fulltext_release, fulltext_file
+            )
+            if exclude_web_fulltext and grobid_fulltext:
+                if not fulltext:
+                    # include only partial fulltext object, with no access
+                    fulltext = grobid_fulltext.remove_access()
+            else:
+                fulltext = grobid_fulltext
 
     if not fulltext and heavy.pdftotext_fulltext:
         fulltext_release = [
@@ -799,6 +804,16 @@ def main() -> None:
         parser.print_help(file=sys.stderr)
         sys.exit(-1)
 
+    # enable sentry exception catching; this helps a lot with debugging bulk
+    # transform runs
+    if settings.SENTRY_DSN:
+        sentry_sdk.init(
+            dsn=settings.SENTRY_DSN,
+            environment=settings.SCHOLAR_ENV,
+            max_breadcrumbs=10,
+            release=GIT_REVISION,
+        )
+
     if args.func == "run_transform":
         run_transform(infile=args.json_file)
     elif args.func == "run_refs":