diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-04-11 05:33:53 +0000 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-04-11 05:33:53 +0000 |
commit | 43f4a6ec3895f4ac5a7db0dfa237aed44f52358b (patch) | |
tree | f06585e20b879f7350f3a0f9fa08bf67d07c251e | |
parent | be1704a419a1e916bb0055e2b40d2db026976001 (diff) | |
download | sandcrawler-43f4a6ec3895f4ac5a7db0dfa237aed44f52358b.tar.gz sandcrawler-43f4a6ec3895f4ac5a7db0dfa237aed44f52358b.zip |
use NLineInputFormat so we can control split size
-rwxr-xr-x | mapreduce/extraction_cdx_grobid.py | 1 |
1 files changed, 1 insertions, 0 deletions
```diff
diff --git a/mapreduce/extraction_cdx_grobid.py b/mapreduce/extraction_cdx_grobid.py
index f6c1ec1..6690f49 100755
--- a/mapreduce/extraction_cdx_grobid.py
+++ b/mapreduce/extraction_cdx_grobid.py
@@ -40,6 +40,7 @@ sentry_client = raven.Client()
 class MRExtractCdxGrobid(MRJob):
 
     # CDX lines in; JSON status out
+    HADOOP_INPUT_FORMAT = 'org.apache.hadoop.mapred.lib.NLineInputFormat'
     INPUT_PROTOCOL = mrjob.protocol.RawValueProtocol
     OUTPUT_PROTOCOL = mrjob.protocol.JSONValueProtocol
 
```