aboutsummaryrefslogtreecommitdiffstats
path: root/mapreduce
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2018-04-15 05:44:59 +0000
committerBryan Newbold <bnewbold@archive.org>2018-04-19 05:15:02 +0000
commite0d1e381bf536d1c077546526c21eab909444193 (patch)
tree1665ce66a95981881fea58dee3f13f8b73eb55a3 /mapreduce
parenta8a568f03d7f537a8683adf23f6643c7704e8d3d (diff)
downloadsandcrawler-e0d1e381bf536d1c077546526c21eab909444193.tar.gz
sandcrawler-e0d1e381bf536d1c077546526c21eab909444193.zip
NLineInputFormat requires RawProtocol
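The input protocol should eventually be selectable (for example, via a command-line argument): RawProtocol together with NLineInputFormat is wanted when running on Hadoop, while RawValueProtocol is wanted for local, inline, and test runs.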
Diffstat (limited to 'mapreduce')
-rwxr-xr-xmapreduce/extraction_cdx_grobid.py3
1 files changed, 2 insertions, 1 deletions
diff --git a/mapreduce/extraction_cdx_grobid.py b/mapreduce/extraction_cdx_grobid.py
index 6690f49..e23950c 100755
--- a/mapreduce/extraction_cdx_grobid.py
+++ b/mapreduce/extraction_cdx_grobid.py
@@ -40,7 +40,8 @@ sentry_client = raven.Client()
class MRExtractCdxGrobid(MRJob):
# CDX lines in; JSON status out
- HADOOP_INPUT_FORMAT = 'org.apache.hadoop.mapred.lib.NLineInputFormat'
+ #HADOOP_INPUT_FORMAT = 'org.apache.hadoop.mapred.lib.NLineInputFormat'
+ #INPUT_PROTOCOL = mrjob.protocol.RawProtocol
INPUT_PROTOCOL = mrjob.protocol.RawValueProtocol
OUTPUT_PROTOCOL = mrjob.protocol.JSONValueProtocol