1 files changed, 33 insertions, 18 deletions
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index 4114885..cbf0a89 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -1,14 +1,16 @@
 import sys
 import argparse
 import datetime
+import xml.etree.ElementTree
 import xml.etree.ElementTree as ET
 from typing import List, Dict, Optional, Any, Sequence
 
+import sentry_sdk
 from fatcat_openapi_client import ReleaseEntity, FileEntity, WebcaptureEntity
 
 from fatcat_scholar.api_entities import *
 from fatcat_scholar.schema import *
-from fatcat_scholar.config import settings
+from fatcat_scholar.config import settings, GIT_REVISION
 from fatcat_scholar.grobid2json import teixml2json
 
 
@@ -138,11 +140,10 @@ def es_biblio_from_sim(sim: Dict[str, Any]) -> ScholarBiblio:
     if release_year and abs(release_year) > 2050:
         release_year = None
 
-    lang_code = SIM_LANG_MAP.get(issue_meta.get("language")) or SIM_LANG_MAP.get(
-        pub_meta.get("language")
-    )
-    if isinstance(lang_code, list):
-        lang_code = lang_code[0]
+    language = issue_meta.get("language") or pub_meta.get("language")
+    if isinstance(language, list):
+        language = language[0]
+    lang_code = SIM_LANG_MAP.get(language)
 
     return ScholarBiblio(
         # release_ident=release.ident,
@@ -479,18 +480,22 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
             for f in fulltext_release.files
             if f.ident == heavy.grobid_fulltext["file_ident"]
         ][0]
-        tei_dict = teixml2json(heavy.grobid_fulltext["tei_xml"])
-        if not abstracts:
-            abstracts = es_abstracts_from_grobid(tei_dict)
-        grobid_fulltext = es_fulltext_from_grobid(
-            tei_dict, heavy.pdf_meta, fulltext_release, fulltext_file
-        )
-        if exclude_web_fulltext and grobid_fulltext:
-            if not fulltext:
-                # include only partial fulltext object, with no access
-                fulltext = grobid_fulltext.remove_access()
-        else:
-            fulltext = grobid_fulltext
+        try:
+            tei_dict: Optional[dict] = teixml2json(heavy.grobid_fulltext["tei_xml"])
+        except xml.etree.ElementTree.ParseError:
+            tei_dict = None
+        if tei_dict:
+            if not abstracts:
+                abstracts = es_abstracts_from_grobid(tei_dict)
+            grobid_fulltext = es_fulltext_from_grobid(
+                tei_dict, heavy.pdf_meta, fulltext_release, fulltext_file
+            )
+            if exclude_web_fulltext and grobid_fulltext:
+                if not fulltext:
+                    # include only partial fulltext object, with no access
+                    fulltext = grobid_fulltext.remove_access()
+            else:
+                fulltext = grobid_fulltext
 
     if not fulltext and heavy.pdftotext_fulltext:
         fulltext_release = [
@@ -799,6 +804,16 @@ def main() -> None:
         parser.print_help(file=sys.stderr)
         sys.exit(-1)
 
+    # enable Sentry exception reporting; this helps a lot with debugging bulk
+    # transform runs
+    if settings.SENTRY_DSN:
+        sentry_sdk.init(
+            dsn=settings.SENTRY_DSN,
+            environment=settings.SCHOLAR_ENV,
+            max_breadcrumbs=10,
+            release=GIT_REVISION,
+        )
+
     if args.func == "run_transform":
         run_transform(infile=args.json_file)
     elif args.func == "run_refs":