diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-04-11 04:02:30 +0000 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-04-11 04:02:30 +0000 |
commit | 59f1ae734b1599507497cad185d7f3ef89bf0ca3 (patch) | |
tree | ea96f658bd6a7399e2aac7cd7ab3444f4b356ced /mapreduce | |
parent | e495a75fae486fc31fded33cf83b577518361d05 (diff) | |
download | sandcrawler-59f1ae734b1599507497cad185d7f3ef89bf0ca3.tar.gz sandcrawler-59f1ae734b1599507497cad185d7f3ef89bf0ca3.zip |
don't try to decode GROBID output
Diffstat (limited to 'mapreduce')
-rwxr-xr-x | mapreduce/extraction_cdx_grobid.py | 4 |
1 files changed, 2 insertions, 2 deletions
```diff
diff --git a/mapreduce/extraction_cdx_grobid.py b/mapreduce/extraction_cdx_grobid.py
index 0812884..e14b925 100755
--- a/mapreduce/extraction_cdx_grobid.py
+++ b/mapreduce/extraction_cdx_grobid.py
@@ -138,9 +138,9 @@ class MRExtractCdxGrobid(MRJob):
         info['grobid0:status_code'] = grobid_response.status_code
         if grobid_response.status_code != 200:
             # response.text is .content decoded as utf-8
-            info['grobid0:status'] = json.loads(grobid_response.text)
+            info['grobid0:status'] = dict(description=grobid_response.text)
             return info, dict(status="error",
                 reason="non-200 GROBID HTTP status",
-                extra=grobid_response.content)
+                extra=grobid_response.text)
         info['grobid0:status'] = {'status': 'success'}
         info['grobid0:tei_xml'] = grobid_response.content
```