about | summary | refs | log | tree | commit | diff | stats
path: root/mapreduce/extraction_cdx_grobid.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2018-04-10 19:37:04 -0700
committer Bryan Newbold <bnewbold@archive.org> 2018-04-10 19:37:04 -0700
commit 41da591130b464e36d0b91d35796026b2d7c4088 (patch)
tree e4600d50c0f536070fbb454a6caf86bed26a9521 /mapreduce/extraction_cdx_grobid.py
parent a0be9706997182b18e48000375c462856aafc5ef (diff)
download sandcrawler-41da591130b464e36d0b91d35796026b2d7c4088.tar.gz
download sandcrawler-41da591130b464e36d0b91d35796026b2d7c4088.zip
cleanup tests; add one for double-processing
Diffstat (limited to 'mapreduce/extraction_cdx_grobid.py')
-rwxr-xr-x mapreduce/extraction_cdx_grobid.py 10
1 files changed, 5 insertions, 5 deletions
diff --git a/mapreduce/extraction_cdx_grobid.py b/mapreduce/extraction_cdx_grobid.py
index 708e170..db36cac 100755
--- a/mapreduce/extraction_cdx_grobid.py
+++ b/mapreduce/extraction_cdx_grobid.py
@@ -143,7 +143,7 @@ class MRExtractCdxGrobid(MRJob):
return info, dict(status="error", reason="non-200 GROBID HTTP status",
extra=grobid_response.content)
- info['grobid0:status'] = {}
+ info['grobid0:status'] = {'status': 'success'}
info['grobid0:tei_xml'] = grobid_response.content
# Convert TEI XML to JSON
@@ -189,9 +189,9 @@ class MRExtractCdxGrobid(MRJob):
key = info['key']
# Check if we've already processed this line
- oldrow = self.hb_table.row(key, columns=['f', 'file',
- 'grobid0:status_code'])
- if oldrow.get('grobid0:status', None):
+ oldrow = self.hb_table.row(key, columns=[b'f', b'file',
+ b'grobid0:status_code'])
+ if oldrow.get(b'grobid0:status_code', None) != None:
# This file has already been processed; skip it
self.increment_counter('lines', 'existing')
yield _, dict(status="existing", key=key)
@@ -209,7 +209,7 @@ class MRExtractCdxGrobid(MRJob):
# Particularly: ('f:c', 'file:mime', 'file:size', 'file:cdx')
grobid_status = info.get('grobid0:status_code', None)
for k in list(info.keys()):
- if k in oldrow:
+ if k.encode('utf-8') in oldrow:
info.pop(k)
# Convert fields to binary