From 5211f3d70a878d32d8381aed9d7b582d9d439c3f Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Sat, 30 Jan 2021 19:29:20 -0800 Subject: catch TEI-XML parsing exception --- fatcat_scholar/transform.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'fatcat_scholar') diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py index a2cdcb9..cbf0a89 100644 --- a/fatcat_scholar/transform.py +++ b/fatcat_scholar/transform.py @@ -1,6 +1,7 @@ import sys import argparse import datetime +import xml.etree.ElementTree import xml.etree.ElementTree as ET from typing import List, Dict, Optional, Any, Sequence @@ -479,18 +480,22 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]: for f in fulltext_release.files if f.ident == heavy.grobid_fulltext["file_ident"] ][0] - tei_dict = teixml2json(heavy.grobid_fulltext["tei_xml"]) - if not abstracts: - abstracts = es_abstracts_from_grobid(tei_dict) - grobid_fulltext = es_fulltext_from_grobid( - tei_dict, heavy.pdf_meta, fulltext_release, fulltext_file - ) - if exclude_web_fulltext and grobid_fulltext: - if not fulltext: - # include only partial fulltext object, with no access - fulltext = grobid_fulltext.remove_access() - else: - fulltext = grobid_fulltext + try: + tei_dict: Optional[dict] = teixml2json(heavy.grobid_fulltext["tei_xml"]) + except xml.etree.ElementTree.ParseError: + tei_dict = None + if tei_dict: + if not abstracts: + abstracts = es_abstracts_from_grobid(tei_dict) + grobid_fulltext = es_fulltext_from_grobid( + tei_dict, heavy.pdf_meta, fulltext_release, fulltext_file + ) + if exclude_web_fulltext and grobid_fulltext: + if not fulltext: + # include only partial fulltext object, with no access + fulltext = grobid_fulltext.remove_access() + else: + fulltext = grobid_fulltext if not fulltext and heavy.pdftotext_fulltext: fulltext_release = [ -- cgit v1.2.3