From 8713b2875c01efd438584c1e17c1c459682b04ea Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 21 Nov 2018 17:24:22 -0800 Subject: kafka_grobid fixes and hbase WIP --- python/kafka_grobid.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'python/kafka_grobid.py') diff --git a/python/kafka_grobid.py b/python/kafka_grobid.py index e57ace6..ada7264 100755 --- a/python/kafka_grobid.py +++ b/python/kafka_grobid.py @@ -159,7 +159,7 @@ class KafkaGrobidWorker: info['grobid0:tei_xml'] = grobid_response.content info['grobid0:status'] = {'status': 'success'} - return info + return info, None def do_work(self, raw_line): """ @@ -199,6 +199,10 @@ class KafkaGrobidWorker: return None, status extraction_status = status + # Need to encode 'bytes' as 'str' for JSON serialization + if info.get('grobid0:tei_xml'): + info['grobid0:tei_xml'] = info['grobid0:tei_xml'].decode('utf-8') + #self.increment_counter('lines', 'success') grobid_status_code = info.get('grobid0:status_code', None) @@ -236,7 +240,7 @@ class KafkaGrobidWorker: print("got a line! ") grobid_output, status = self.do_work(msg.value.decode('utf-8')) if grobid_output: - producer.produce(json.dumps(work).encode('utf-8')) + producer.produce(json.dumps(grobid_output).encode('utf-8')) sequential_failures = 0 else: print("failed to extract: {}".format(status)) -- cgit v1.2.3