diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2018-08-25 15:09:57 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2018-08-25 15:09:57 -0700 | 
| commit | 1134015c58d368013eb6ec9e7c0a68f3ad00077a (patch) | |
| tree | 8e87a884aef1a0b7087d3370e03a4d146db70e2c | |
| parent | 52c31b60eaa0d7153dad8a7f3ae2fdebcf383e66 (diff) | |
| download | sandcrawler-1134015c58d368013eb6ec9e7c0a68f3ad00077a.tar.gz sandcrawler-1134015c58d368013eb6ec9e7c0a68f3ad00077a.zip  | |
disambiguate parse_line method
| -rwxr-xr-x | python/extraction_ungrobided.py | 6 | 
1 files changed, 3 insertions, 3 deletions
```diff
diff --git a/python/extraction_ungrobided.py b/python/extraction_ungrobided.py
index aedf715..74644e0 100755
--- a/python/extraction_ungrobided.py
+++ b/python/extraction_ungrobided.py
@@ -29,13 +29,13 @@ from extraction_cdx_grobid import MRExtractCdxGrobid, KEY_BLACKLIST, \
 
 class MRExtractUnGrobided(MRExtractCdxGrobid):
 
-    # CDX lines in; JSON status out
+    # "ungrobided" TSV lines in; JSON status out
     #HADOOP_INPUT_FORMAT = 'org.apache.hadoop.mapred.lib.NLineInputFormat'
     #INPUT_PROTOCOL = mrjob.protocol.RawProtocol
     INPUT_PROTOCOL = mrjob.protocol.RawValueProtocol
     OUTPUT_PROTOCOL = mrjob.protocol.JSONValueProtocol
 
-    def parse_line(self, raw_line):
+    def parse_ungrobided_line(self, raw_line):
         """Line should be TSV and have non-null fields:
 
             - key (string)
@@ -74,7 +74,7 @@ class MRExtractUnGrobided(MRExtractCdxGrobid):
         self.increment_counter('lines', 'total')
 
         # Parse line and filter down
-        info, status = self.parse_line(raw_line)
+        info, status = self.parse_ungrobided_line(raw_line)
         if info is None:
             self.increment_counter('lines', status['status'])
             yield _, status
```
