aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2018-08-25 15:09:57 -0700
committer Bryan Newbold <bnewbold@archive.org> 2018-08-25 15:09:57 -0700
commit1134015c58d368013eb6ec9e7c0a68f3ad00077a (patch)
tree8e87a884aef1a0b7087d3370e03a4d146db70e2c /python
parent52c31b60eaa0d7153dad8a7f3ae2fdebcf383e66 (diff)
downloadsandcrawler-1134015c58d368013eb6ec9e7c0a68f3ad00077a.tar.gz
sandcrawler-1134015c58d368013eb6ec9e7c0a68f3ad00077a.zip
disambiguate parse_line method
Diffstat (limited to 'python')
-rwxr-xr-x python/extraction_ungrobided.py 6
1 files changed, 3 insertions, 3 deletions
diff --git a/python/extraction_ungrobided.py b/python/extraction_ungrobided.py
index aedf715..74644e0 100755
--- a/python/extraction_ungrobided.py
+++ b/python/extraction_ungrobided.py
@@ -29,13 +29,13 @@ from extraction_cdx_grobid import MRExtractCdxGrobid, KEY_BLACKLIST, \
class MRExtractUnGrobided(MRExtractCdxGrobid):
- # CDX lines in; JSON status out
+ # "ungrobided" TSV lines in; JSON status out
#HADOOP_INPUT_FORMAT = 'org.apache.hadoop.mapred.lib.NLineInputFormat'
#INPUT_PROTOCOL = mrjob.protocol.RawProtocol
INPUT_PROTOCOL = mrjob.protocol.RawValueProtocol
OUTPUT_PROTOCOL = mrjob.protocol.JSONValueProtocol
- def parse_line(self, raw_line):
+ def parse_ungrobided_line(self, raw_line):
"""Line should be TSV and have non-null fields:
- key (string)
@@ -74,7 +74,7 @@ class MRExtractUnGrobided(MRExtractCdxGrobid):
self.increment_counter('lines', 'total')
# Parse line and filter down
- info, status = self.parse_line(raw_line)
+ info, status = self.parse_ungrobided_line(raw_line)
if info is None:
self.increment_counter('lines', status['status'])
yield _, status