diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-04-10 19:37:04 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-04-10 19:37:04 -0700 |
commit | 41da591130b464e36d0b91d35796026b2d7c4088 (patch) | |
tree | e4600d50c0f536070fbb454a6caf86bed26a9521 /mapreduce/extraction_cdx_grobid.py | |
parent | a0be9706997182b18e48000375c462856aafc5ef (diff) | |
download | sandcrawler-41da591130b464e36d0b91d35796026b2d7c4088.tar.gz sandcrawler-41da591130b464e36d0b91d35796026b2d7c4088.zip |
cleanup tests; add one for double-processing
Diffstat (limited to 'mapreduce/extraction_cdx_grobid.py')
-rwxr-xr-x | mapreduce/extraction_cdx_grobid.py | 10 |
1 files changed, 5 insertions, 5 deletions
```diff
diff --git a/mapreduce/extraction_cdx_grobid.py b/mapreduce/extraction_cdx_grobid.py
index 708e170..db36cac 100755
--- a/mapreduce/extraction_cdx_grobid.py
+++ b/mapreduce/extraction_cdx_grobid.py
@@ -143,7 +143,7 @@ class MRExtractCdxGrobid(MRJob):
             return info, dict(status="error", reason="non-200 GROBID HTTP status",
                 extra=grobid_response.content)
 
-        info['grobid0:status'] = {}
+        info['grobid0:status'] = {'status': 'success'}
         info['grobid0:tei_xml'] = grobid_response.content
 
         # Convert TEI XML to JSON
@@ -189,9 +189,9 @@ class MRExtractCdxGrobid(MRJob):
         key = info['key']
 
         # Check if we've already processed this line
-        oldrow = self.hb_table.row(key, columns=['f', 'file',
-            'grobid0:status_code'])
-        if oldrow.get('grobid0:status', None):
+        oldrow = self.hb_table.row(key, columns=[b'f', b'file',
+            b'grobid0:status_code'])
+        if oldrow.get(b'grobid0:status_code', None) != None:
             # This file has already been processed; skip it
             self.increment_counter('lines', 'existing')
             yield _, dict(status="existing", key=key)
@@ -209,7 +209,7 @@ class MRExtractCdxGrobid(MRJob):
         # Particularly: ('f:c', 'file:mime', 'file:size', 'file:cdx')
         grobid_status = info.get('grobid0:status_code', None)
         for k in list(info.keys()):
-            if k in oldrow:
+            if k.encode('utf-8') in oldrow:
                 info.pop(k)
 
         # Convert fields to binary
```