From 43f4a6ec3895f4ac5a7db0dfa237aed44f52358b Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 11 Apr 2018 05:33:53 +0000 Subject: use NLineInputFormat so we can control split size --- mapreduce/extraction_cdx_grobid.py | 1 + 1 file changed, 1 insertion(+) (limited to 'mapreduce/extraction_cdx_grobid.py') diff --git a/mapreduce/extraction_cdx_grobid.py b/mapreduce/extraction_cdx_grobid.py index f6c1ec1..6690f49 100755 --- a/mapreduce/extraction_cdx_grobid.py +++ b/mapreduce/extraction_cdx_grobid.py @@ -40,6 +40,7 @@ sentry_client = raven.Client() class MRExtractCdxGrobid(MRJob): # CDX lines in; JSON status out + HADOOP_INPUT_FORMAT = 'org.apache.hadoop.mapred.lib.NLineInputFormat' INPUT_PROTOCOL = mrjob.protocol.RawValueProtocol OUTPUT_PROTOCOL = mrjob.protocol.JSONValueProtocol -- cgit v1.2.3