diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-08-25 15:09:57 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-08-25 15:09:57 -0700 |
commit | 1134015c58d368013eb6ec9e7c0a68f3ad00077a (patch) | |
tree | 8e87a884aef1a0b7087d3370e03a4d146db70e2c | |
parent | 52c31b60eaa0d7153dad8a7f3ae2fdebcf383e66 (diff) | |
download | sandcrawler-1134015c58d368013eb6ec9e7c0a68f3ad00077a.tar.gz sandcrawler-1134015c58d368013eb6ec9e7c0a68f3ad00077a.zip |
disambiguate parse_line method
-rwxr-xr-x | python/extraction_ungrobided.py | 6 |
1 files changed, 3 insertions, 3 deletions
diff --git a/python/extraction_ungrobided.py b/python/extraction_ungrobided.py
index aedf715..74644e0 100755
--- a/python/extraction_ungrobided.py
+++ b/python/extraction_ungrobided.py
@@ -29,13 +29,13 @@ from extraction_cdx_grobid import MRExtractCdxGrobid, KEY_BLACKLIST, \
 
 class MRExtractUnGrobided(MRExtractCdxGrobid):
 
-    # CDX lines in; JSON status out
+    # "ungrobided" TSV lines in; JSON status out
     #HADOOP_INPUT_FORMAT = 'org.apache.hadoop.mapred.lib.NLineInputFormat'
     #INPUT_PROTOCOL = mrjob.protocol.RawProtocol
     INPUT_PROTOCOL = mrjob.protocol.RawValueProtocol
     OUTPUT_PROTOCOL = mrjob.protocol.JSONValueProtocol
 
-    def parse_line(self, raw_line):
+    def parse_ungrobided_line(self, raw_line):
         """Line should be TSV and have non-null fields:
 
             - key (string)
@@ -74,7 +74,7 @@ class MRExtractUnGrobided(MRExtractCdxGrobid):
         self.increment_counter('lines', 'total')
 
         # Parse line and filter down
-        info, status = self.parse_line(raw_line)
+        info, status = self.parse_ungrobided_line(raw_line)
         if info is None:
             self.increment_counter('lines', status['status'])
             yield _, status