diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-11-19 23:54:54 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-11-19 23:54:54 -0800 |
commit | c5436c73e2b75a9a936d53575caf66de26ef9195 (patch) | |
tree | f72c7664a55742bfb005d66c06d89033f0931bca /python | |
parent | 5537b666ad392fb13aa956ebff4e7aa0927b68ee (diff) | |
download | sandcrawler-c5436c73e2b75a9a936d53575caf66de26ef9195.tar.gz sandcrawler-c5436c73e2b75a9a936d53575caf66de26ef9195.zip |
xml: catch parse error
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/ingest.py | 11 |
1 file changed, 8 insertions, 3 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index e77ec05..abcc156 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -4,10 +4,12 @@ import json
 import gzip
 import time
 import base64
-import requests
+import xml.etree.ElementTree
+from collections import namedtuple
 from typing import Optional, Tuple, Any, Dict, List
 from http.server import BaseHTTPRequestHandler, HTTPServer
-from collections import namedtuple
+
+import requests
 from selectolax.parser import HTMLParser
 
 from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, WaybackContentError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult, fix_transfer_encoding, NoCaptureError
@@ -330,7 +332,10 @@ class IngestFileWorker(SandcrawlerWorker):
             count), or attempting to fetch sub-resources.
         """
         if self.xmldoc_sink and file_meta['mimetype'] == "application/jats+xml":
-            jats_xml = xml_reserialize(resource.body)
+            try:
+                jats_xml = xml_reserialize(resource.body)
+            except xml.etree.ElementTree.ParseError:
+                return dict(status="xml-parse-error")
             msg = dict(
                 sha1hex=file_meta["sha1hex"],
                 status="success",