about | summary | refs | log | tree | commit | diff | stats
path: root/mapreduce
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2018-04-11 04:02:30 +0000
committer Bryan Newbold <bnewbold@archive.org> 2018-04-11 04:02:30 +0000
commit 59f1ae734b1599507497cad185d7f3ef89bf0ca3 (patch)
tree ea96f658bd6a7399e2aac7cd7ab3444f4b356ced /mapreduce
parent e495a75fae486fc31fded33cf83b577518361d05 (diff)
download sandcrawler-59f1ae734b1599507497cad185d7f3ef89bf0ca3.tar.gz
sandcrawler-59f1ae734b1599507497cad185d7f3ef89bf0ca3.zip
don't try to decode GROBID output
Diffstat (limited to 'mapreduce')
-rwxr-xr-x mapreduce/extraction_cdx_grobid.py 4
1 files changed, 2 insertions, 2 deletions
diff --git a/mapreduce/extraction_cdx_grobid.py b/mapreduce/extraction_cdx_grobid.py
index 0812884..e14b925 100755
--- a/mapreduce/extraction_cdx_grobid.py
+++ b/mapreduce/extraction_cdx_grobid.py
@@ -138,9 +138,9 @@ class MRExtractCdxGrobid(MRJob):
info['grobid0:status_code'] = grobid_response.status_code
if grobid_response.status_code != 200:
# response.text is .content decoded as utf-8
- info['grobid0:status'] = json.loads(grobid_response.text)
+ info['grobid0:status'] = dict(description=grobid_response.text)
return info, dict(status="error", reason="non-200 GROBID HTTP status",
- extra=grobid_response.content)
+ extra=grobid_response.text)
info['grobid0:status'] = {'status': 'success'}
info['grobid0:tei_xml'] = grobid_response.content