summary | refs | log | tree | commit | diff | stats
path: root/fatcat_scholar
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2021-01-30 19:29:20 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2021-01-30 19:29:20 -0800
commit: 5211f3d70a878d32d8381aed9d7b582d9d439c3f (patch)
tree: ecef79f3a2be063538c3467b33055b32a945c085 /fatcat_scholar
parent: e86414a774d4a0c6db62110f04a6c96365afbf6c (diff)
download: fatcat-scholar-5211f3d70a878d32d8381aed9d7b582d9d439c3f.tar.gz
          fatcat-scholar-5211f3d70a878d32d8381aed9d7b582d9d439c3f.zip
catch TEI-XML parsing exception
Diffstat (limited to 'fatcat_scholar')
-rw-r--r-- fatcat_scholar/transform.py 29
1 files changed, 17 insertions, 12 deletions
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index a2cdcb9..cbf0a89 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -1,6 +1,7 @@
import sys
import argparse
import datetime
+import xml.etree.ElementTree
import xml.etree.ElementTree as ET
from typing import List, Dict, Optional, Any, Sequence
@@ -479,18 +480,22 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
for f in fulltext_release.files
if f.ident == heavy.grobid_fulltext["file_ident"]
][0]
- tei_dict = teixml2json(heavy.grobid_fulltext["tei_xml"])
- if not abstracts:
- abstracts = es_abstracts_from_grobid(tei_dict)
- grobid_fulltext = es_fulltext_from_grobid(
- tei_dict, heavy.pdf_meta, fulltext_release, fulltext_file
- )
- if exclude_web_fulltext and grobid_fulltext:
- if not fulltext:
- # include only partial fulltext object, with no access
- fulltext = grobid_fulltext.remove_access()
- else:
- fulltext = grobid_fulltext
+ try:
+ tei_dict: Optional[dict] = teixml2json(heavy.grobid_fulltext["tei_xml"])
+ except xml.etree.ElementTree.ParseError:
+ tei_dict = None
+ if tei_dict:
+ if not abstracts:
+ abstracts = es_abstracts_from_grobid(tei_dict)
+ grobid_fulltext = es_fulltext_from_grobid(
+ tei_dict, heavy.pdf_meta, fulltext_release, fulltext_file
+ )
+ if exclude_web_fulltext and grobid_fulltext:
+ if not fulltext:
+ # include only partial fulltext object, with no access
+ fulltext = grobid_fulltext.remove_access()
+ else:
+ fulltext = grobid_fulltext
if not fulltext and heavy.pdftotext_fulltext:
fulltext_release = [