diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-01-30 19:29:20 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-01-30 19:29:20 -0800 |
commit | 5211f3d70a878d32d8381aed9d7b582d9d439c3f (patch) | |
tree | ecef79f3a2be063538c3467b33055b32a945c085 | |
parent | e86414a774d4a0c6db62110f04a6c96365afbf6c (diff) | |
download | fatcat-scholar-5211f3d70a878d32d8381aed9d7b582d9d439c3f.tar.gz fatcat-scholar-5211f3d70a878d32d8381aed9d7b582d9d439c3f.zip |
catch TEI-XML parsing exception
-rw-r--r-- | fatcat_scholar/transform.py | 29 |
1 file changed, 17 insertions, 12 deletions
```diff
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index a2cdcb9..cbf0a89 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -1,6 +1,7 @@
 import sys
 import argparse
 import datetime
+import xml.etree.ElementTree
 import xml.etree.ElementTree as ET
 from typing import List, Dict, Optional, Any, Sequence
 
@@ -479,18 +480,22 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
             for f in fulltext_release.files
             if f.ident == heavy.grobid_fulltext["file_ident"]
         ][0]
-        tei_dict = teixml2json(heavy.grobid_fulltext["tei_xml"])
-        if not abstracts:
-            abstracts = es_abstracts_from_grobid(tei_dict)
-        grobid_fulltext = es_fulltext_from_grobid(
-            tei_dict, heavy.pdf_meta, fulltext_release, fulltext_file
-        )
-        if exclude_web_fulltext and grobid_fulltext:
-            if not fulltext:
-                # include only partial fulltext object, with no access
-                fulltext = grobid_fulltext.remove_access()
-        else:
-            fulltext = grobid_fulltext
+        try:
+            tei_dict: Optional[dict] = teixml2json(heavy.grobid_fulltext["tei_xml"])
+        except xml.etree.ElementTree.ParseError:
+            tei_dict = None
+        if tei_dict:
+            if not abstracts:
+                abstracts = es_abstracts_from_grobid(tei_dict)
+            grobid_fulltext = es_fulltext_from_grobid(
+                tei_dict, heavy.pdf_meta, fulltext_release, fulltext_file
+            )
+            if exclude_web_fulltext and grobid_fulltext:
+                if not fulltext:
+                    # include only partial fulltext object, with no access
+                    fulltext = grobid_fulltext.remove_access()
+            else:
+                fulltext = grobid_fulltext
 
     if not fulltext and heavy.pdftotext_fulltext:
         fulltext_release = [
```