From 1134015c58d368013eb6ec9e7c0a68f3ad00077a Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Sat, 25 Aug 2018 15:09:57 -0700 Subject: disambiguate parse_line method --- python/extraction_ungrobided.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'python') diff --git a/python/extraction_ungrobided.py b/python/extraction_ungrobided.py index aedf715..74644e0 100755 --- a/python/extraction_ungrobided.py +++ b/python/extraction_ungrobided.py @@ -29,13 +29,13 @@ from extraction_cdx_grobid import MRExtractCdxGrobid, KEY_BLACKLIST, \ class MRExtractUnGrobided(MRExtractCdxGrobid): - # CDX lines in; JSON status out + # "ungrobided" TSV lines in; JSON status out #HADOOP_INPUT_FORMAT = 'org.apache.hadoop.mapred.lib.NLineInputFormat' #INPUT_PROTOCOL = mrjob.protocol.RawProtocol INPUT_PROTOCOL = mrjob.protocol.RawValueProtocol OUTPUT_PROTOCOL = mrjob.protocol.JSONValueProtocol - def parse_line(self, raw_line): + def parse_ungrobided_line(self, raw_line): """Line should be TSV and have non-null fields: - key (string) @@ -74,7 +74,7 @@ class MRExtractUnGrobided(MRExtractCdxGrobid): self.increment_counter('lines', 'total') # Parse line and filter down - info, status = self.parse_line(raw_line) + info, status = self.parse_ungrobided_line(raw_line) if info is None: self.increment_counter('lines', status['status']) yield _, status -- cgit v1.2.3