about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2018-11-21 17:53:32 -0800
committer Bryan Newbold <bnewbold@archive.org> 2018-11-21 17:53:32 -0800
commit 6591acdda8b09289fabfa913b2f6bb51642fd38f (patch)
tree 99a0e373d21f0a3380d86ac31d4ead21e908deb4
parent 3054e1ce5395c70d75c7750a77ba8a49648bc504 (diff)
download sandcrawler-6591acdda8b09289fabfa913b2f6bb51642fd38f.tar.gz
sandcrawler-6591acdda8b09289fabfa913b2f6bb51642fd38f.zip
cherry-pick: correct HBase column filtering
-rwxr-xr-x python/extraction_ungrobided.py 2
1 files changed, 1 insertions, 1 deletions
diff --git a/python/extraction_ungrobided.py b/python/extraction_ungrobided.py
index af38cea..4b558dd 100755
--- a/python/extraction_ungrobided.py
+++ b/python/extraction_ungrobided.py
@@ -242,7 +242,7 @@ class MRExtractUnGrobided(MRJob):
# Basically, don't overwrite backfill fields.
grobid_status_code = info.get('grobid0:status_code', None)
for k in list(info.keys()):
- if k.encode('utf-8') in ('f:c', 'file:mime', 'file:cdx'):
+ if k in ('f:c', 'file:mime', 'file:cdx'):
info.pop(k)
# Convert fields to binary