aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2018-08-21 16:28:53 -0700
committer Bryan Newbold <bnewbold@archive.org> 2018-08-21 16:28:55 -0700
commit 6c92ee4c0b137c28abd03ed72190210da8a1e72b (patch)
tree 525b1f8992c201ded982fcbf70a9340ef86bdae0
parent 139ca7e5a90d49c33e23de781b7e4ac21e868fac (diff)
download sandcrawler-6c92ee4c0b137c28abd03ed72190210da8a1e72b.tar.gz
sandcrawler-6c92ee4c0b137c28abd03ed72190210da8a1e72b.zip
extraction: do want content, not text
XML can have non-unicode characters? Who knew.
-rwxr-xr-x mapreduce/extraction_cdx_grobid.py 2
1 files changed, 1 insertions, 1 deletions
diff --git a/mapreduce/extraction_cdx_grobid.py b/mapreduce/extraction_cdx_grobid.py
index 227a026..040538c 100755
--- a/mapreduce/extraction_cdx_grobid.py
+++ b/mapreduce/extraction_cdx_grobid.py
@@ -163,7 +163,7 @@ class MRExtractCdxGrobid(MRJob):
extra=grobid_response.text)
info['grobid0:status'] = {'status': 'partial'}
- info['grobid0:tei_xml'] = grobid_response.text
+ info['grobid0:tei_xml'] = grobid_response.content
# Convert TEI XML to JSON
try: