about summary refs log tree commit diff stats
path: root/mapreduce
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2018-04-11 05:33:53 +0000
committer: Bryan Newbold <bnewbold@archive.org> 2018-04-11 05:33:53 +0000
commit: 43f4a6ec3895f4ac5a7db0dfa237aed44f52358b (patch)
tree: f06585e20b879f7350f3a0f9fa08bf67d07c251e /mapreduce
parent: be1704a419a1e916bb0055e2b40d2db026976001 (diff)
download: sandcrawler-43f4a6ec3895f4ac5a7db0dfa237aed44f52358b.tar.gz
sandcrawler-43f4a6ec3895f4ac5a7db0dfa237aed44f52358b.zip
use NLineInputFormat so we can control split size
Diffstat (limited to 'mapreduce')
-rwxr-xr-x mapreduce/extraction_cdx_grobid.py 1
1 files changed, 1 insertions, 0 deletions
diff --git a/mapreduce/extraction_cdx_grobid.py b/mapreduce/extraction_cdx_grobid.py
index f6c1ec1..6690f49 100755
--- a/mapreduce/extraction_cdx_grobid.py
+++ b/mapreduce/extraction_cdx_grobid.py
@@ -40,6 +40,7 @@ sentry_client = raven.Client()
class MRExtractCdxGrobid(MRJob):
# CDX lines in; JSON status out
+ HADOOP_INPUT_FORMAT = 'org.apache.hadoop.mapred.lib.NLineInputFormat'
INPUT_PROTOCOL = mrjob.protocol.RawValueProtocol
OUTPUT_PROTOCOL = mrjob.protocol.JSONValueProtocol