diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-08-21 16:28:53 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-08-21 16:28:55 -0700 |
commit | 6c92ee4c0b137c28abd03ed72190210da8a1e72b (patch) | |
tree | 525b1f8992c201ded982fcbf70a9340ef86bdae0 /mapreduce | |
parent | 139ca7e5a90d49c33e23de781b7e4ac21e868fac (diff) | |
download | sandcrawler-6c92ee4c0b137c28abd03ed72190210da8a1e72b.tar.gz sandcrawler-6c92ee4c0b137c28abd03ed72190210da8a1e72b.zip |
extraction: do want content, not text
XML can have non-unicode characters? Who knew.
Diffstat (limited to 'mapreduce')
-rwxr-xr-x | mapreduce/extraction_cdx_grobid.py | 2 |
1 file changed, 1 insertion, 1 deletion
diff --git a/mapreduce/extraction_cdx_grobid.py b/mapreduce/extraction_cdx_grobid.py
index 227a026..040538c 100755
--- a/mapreduce/extraction_cdx_grobid.py
+++ b/mapreduce/extraction_cdx_grobid.py
@@ -163,7 +163,7 @@ class MRExtractCdxGrobid(MRJob):
 extra=grobid_response.text)
 info['grobid0:status'] = {'status': 'partial'}
- info['grobid0:tei_xml'] = grobid_response.text
+ info['grobid0:tei_xml'] = grobid_response.content
 # Convert TEI XML to JSON
 try: