diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-04-15 05:44:59 +0000 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-04-19 05:15:02 +0000 |
commit | e0d1e381bf536d1c077546526c21eab909444193 (patch) | |
tree | 1665ce66a95981881fea58dee3f13f8b73eb55a3 | |
parent | a8a568f03d7f537a8683adf23f6643c7704e8d3d (diff) | |
download | sandcrawler-e0d1e381bf536d1c077546526c21eab909444193.tar.gz sandcrawler-e0d1e381bf536d1c077546526c21eab909444193.zip |
NLineInputFormat requires RawProtocol
The choice of input format and input protocol should become a command-line
argument (or similar): one setting is needed when running on Hadoop, the other
for local, test, and inline runs.
-rwxr-xr-x | mapreduce/extraction_cdx_grobid.py | 3 |
1 file changed, 2 insertions, 1 deletion
diff --git a/mapreduce/extraction_cdx_grobid.py b/mapreduce/extraction_cdx_grobid.py
index 6690f49..e23950c 100755
--- a/mapreduce/extraction_cdx_grobid.py
+++ b/mapreduce/extraction_cdx_grobid.py
@@ -40,7 +40,8 @@ sentry_client = raven.Client()
 class MRExtractCdxGrobid(MRJob):
     # CDX lines in; JSON status out
-    HADOOP_INPUT_FORMAT = 'org.apache.hadoop.mapred.lib.NLineInputFormat'
+    #HADOOP_INPUT_FORMAT = 'org.apache.hadoop.mapred.lib.NLineInputFormat'
+    #INPUT_PROTOCOL = mrjob.protocol.RawProtocol
     INPUT_PROTOCOL = mrjob.protocol.RawValueProtocol
     OUTPUT_PROTOCOL = mrjob.protocol.JSONValueProtocol