From e0d1e381bf536d1c077546526c21eab909444193 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Sun, 15 Apr 2018 05:44:59 +0000
Subject: NLineInputFormat requires RawProtocol

Should make this a command line argument or something. Want one in HADOOP, the
other for local/tests/inline/etc.
---
 mapreduce/extraction_cdx_grobid.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mapreduce/extraction_cdx_grobid.py b/mapreduce/extraction_cdx_grobid.py
index 6690f49..e23950c 100755
--- a/mapreduce/extraction_cdx_grobid.py
+++ b/mapreduce/extraction_cdx_grobid.py
@@ -40,7 +40,8 @@ sentry_client = raven.Client()
 class MRExtractCdxGrobid(MRJob):
 
     # CDX lines in; JSON status out
-    HADOOP_INPUT_FORMAT = 'org.apache.hadoop.mapred.lib.NLineInputFormat'
+    #HADOOP_INPUT_FORMAT = 'org.apache.hadoop.mapred.lib.NLineInputFormat'
+    #INPUT_PROTOCOL = mrjob.protocol.RawProtocol
     INPUT_PROTOCOL = mrjob.protocol.RawValueProtocol
     OUTPUT_PROTOCOL = mrjob.protocol.JSONValueProtocol
 
-- 
cgit v1.2.3