path: root/python/kafka_grobid.py
author     Bryan Newbold <bnewbold@archive.org>  2018-11-21 17:24:22 -0800
committer  Bryan Newbold <bnewbold@archive.org>  2018-11-21 17:24:22 -0800
commit     8713b2875c01efd438584c1e17c1c459682b04ea (patch)
tree       cbdfcf05fdefcd59626e35d3818751a88489c0f7 /python/kafka_grobid.py
parent     9e7c651806645b1e59b07f354ce7cdece17d76b7 (diff)
kafka_grobid fixes and hbase WIP
Diffstat (limited to 'python/kafka_grobid.py')
-rwxr-xr-x  python/kafka_grobid.py  8
1 file changed, 6 insertions, 2 deletions
diff --git a/python/kafka_grobid.py b/python/kafka_grobid.py
index e57ace6..ada7264 100755
--- a/python/kafka_grobid.py
+++ b/python/kafka_grobid.py
@@ -159,7 +159,7 @@ class KafkaGrobidWorker:
info['grobid0:tei_xml'] = grobid_response.content
info['grobid0:status'] = {'status': 'success'}
- return info
+ return info, None
def do_work(self, raw_line):
"""
@@ -199,6 +199,10 @@ class KafkaGrobidWorker:
return None, status
extraction_status = status
+ # Need to encode 'bytes' as 'str' for JSON serialization
+ if info.get('grobid0:tei_xml'):
+ info['grobid0:tei_xml'] = info['grobid0:tei_xml'].decode('utf-8')
+
#self.increment_counter('lines', 'success')
grobid_status_code = info.get('grobid0:status_code', None)
@@ -236,7 +240,7 @@ class KafkaGrobidWorker:
print("got a line! ")
grobid_output, status = self.do_work(msg.value.decode('utf-8'))
if grobid_output:
- producer.produce(json.dumps(work).encode('utf-8'))
+ producer.produce(json.dumps(grobid_output).encode('utf-8'))
sequential_failures = 0
else:
print("failed to extract: {}".format(status))