about | summary | refs | log | tree | commit | diff | stats
path: root/python/sandcrawler/persist.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-01-02 18:05:06 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-01-02 18:12:59 -0800
commit 629290b5713f4d33688fdbda614707d82898b0af (patch)
tree 2e85b112a391fbf9bdeffa196eb5f3cd34ea05ea /python/sandcrawler/persist.py
parent 5dc1a8642077b67f3af0a41cdac851bb96a435b7 (diff)
download sandcrawler-629290b5713f4d33688fdbda614707d82898b0af.tar.gz
sandcrawler-629290b5713f4d33688fdbda614707d82898b0af.zip
handle grobid2json errors in calling code instead
Diffstat (limited to 'python/sandcrawler/persist.py')
-rw-r--r-- python/sandcrawler/persist.py 8
1 files changed, 7 insertions, 1 deletions
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 71ada51..c24dec8 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -20,6 +20,7 @@ grobid
"""
import os
+import xml.etree.ElementTree
from sandcrawler.workers import SandcrawlerWorker
from sandcrawler.db import SandcrawlerPostgresClient
@@ -230,7 +231,12 @@ class PersistGrobidWorker(SandcrawlerWorker):
self.counts['s3-put'] += 1
# enhance with teixml2json metadata, if available
- metadata = self.grobid.metadata(r)
+ try:
+ metadata = self.grobid.metadata(r)
+ except xml.etree.ElementTree.ParseError as xml_e:
+ r['status'] = 'bad-grobid-xml'
+ r['metadata'] = {'error_msg': str(xml_e)[:1024]}
+ continue
if not metadata:
continue
for k in ('fatcat_release', 'grobid_version'):