aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-11-19 23:54:54 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-11-19 23:54:54 -0800
commit: c5436c73e2b75a9a936d53575caf66de26ef9195 (patch)
tree: f72c7664a55742bfb005d66c06d89033f0931bca /python
parent: 5537b666ad392fb13aa956ebff4e7aa0927b68ee (diff)
download: sandcrawler-c5436c73e2b75a9a936d53575caf66de26ef9195.tar.gz
sandcrawler-c5436c73e2b75a9a936d53575caf66de26ef9195.zip
xml: catch parse error
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/ingest.py 11
1 files changed, 8 insertions, 3 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index e77ec05..abcc156 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -4,10 +4,12 @@ import json
import gzip
import time
import base64
-import requests
+import xml.etree.ElementTree
+from collections import namedtuple
from typing import Optional, Tuple, Any, Dict, List
from http.server import BaseHTTPRequestHandler, HTTPServer
-from collections import namedtuple
+
+import requests
from selectolax.parser import HTMLParser
from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, WaybackContentError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult, fix_transfer_encoding, NoCaptureError
@@ -330,7 +332,10 @@ class IngestFileWorker(SandcrawlerWorker):
count), or attempting to fetch sub-resources.
"""
if self.xmldoc_sink and file_meta['mimetype'] == "application/jats+xml":
- jats_xml = xml_reserialize(resource.body)
+ try:
+ jats_xml = xml_reserialize(resource.body)
+ except xml.etree.ElementTree.ParseError:
+ return dict(status="xml-parse-error")
msg = dict(
sha1hex=file_meta["sha1hex"],
status="success",