catch TEI-XML parsing exception

author: Bryan Newbold <bnewbold@archive.org> 2021-01-30 19:29:20 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2021-01-30 19:29:20 -0800
commit: 5211f3d70a878d32d8381aed9d7b582d9d439c3f (patch)
tree: ecef79f3a2be063538c3467b33055b32a945c085 /fatcat_scholar
parent: e86414a774d4a0c6db62110f04a6c96365afbf6c (diff)
download: fatcat-scholar-5211f3d70a878d32d8381aed9d7b582d9d439c3f.tar.gz
fatcat-scholar-5211f3d70a878d32d8381aed9d7b582d9d439c3f.zip
1 file changed, 17 insertions, 12 deletions
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index a2cdcb9..cbf0a89 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -1,6 +1,7 @@
 import sys
 import argparse
 import datetime
+import xml.etree.ElementTree
 import xml.etree.ElementTree as ET
 from typing import List, Dict, Optional, Any, Sequence
 
@@ -479,18 +480,22 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
             for f in fulltext_release.files
             if f.ident == heavy.grobid_fulltext["file_ident"]
         ][0]
-        tei_dict = teixml2json(heavy.grobid_fulltext["tei_xml"])
-        if not abstracts:
-            abstracts = es_abstracts_from_grobid(tei_dict)
-        grobid_fulltext = es_fulltext_from_grobid(
-            tei_dict, heavy.pdf_meta, fulltext_release, fulltext_file
-        )
-        if exclude_web_fulltext and grobid_fulltext:
-            if not fulltext:
-                # include only partial fulltext object, with no access
-                fulltext = grobid_fulltext.remove_access()
-        else:
-            fulltext = grobid_fulltext
+        try:
+            tei_dict: Optional[dict] = teixml2json(heavy.grobid_fulltext["tei_xml"])
+        except xml.etree.ElementTree.ParseError:
+            tei_dict = None
+        if tei_dict:
+            if not abstracts:
+                abstracts = es_abstracts_from_grobid(tei_dict)
+            grobid_fulltext = es_fulltext_from_grobid(
+                tei_dict, heavy.pdf_meta, fulltext_release, fulltext_file
+            )
+            if exclude_web_fulltext and grobid_fulltext:
+                if not fulltext:
+                    # include only partial fulltext object, with no access
+                    fulltext = grobid_fulltext.remove_access()
+            else:
+                fulltext = grobid_fulltext
 
     if not fulltext and heavy.pdftotext_fulltext:
         fulltext_release = [
author	Bryan Newbold <bnewbold@archive.org>	2021-01-30 19:29:20 -0800
committer	Bryan Newbold <bnewbold@archive.org>	2021-01-30 19:29:20 -0800
commit	5211f3d70a878d32d8381aed9d7b582d9d439c3f (patch)
tree	ecef79f3a2be063538c3467b33055b32a945c085 /fatcat_scholar
parent	e86414a774d4a0c6db62110f04a6c96365afbf6c (diff)
download	fatcat-scholar-5211f3d70a878d32d8381aed9d7b582d9d439c3f.tar.gz fatcat-scholar-5211f3d70a878d32d8381aed9d7b582d9d439c3f.zip