diff options
Diffstat (limited to 'mapreduce/extraction_cdx_grobid.py')
-rwxr-xr-x | mapreduce/extraction_cdx_grobid.py | 6 |
1 files changed, 3 insertions, 3 deletions
diff --git a/mapreduce/extraction_cdx_grobid.py b/mapreduce/extraction_cdx_grobid.py
index d1ec840..8fa1720 100755
--- a/mapreduce/extraction_cdx_grobid.py
+++ b/mapreduce/extraction_cdx_grobid.py
@@ -30,7 +30,7 @@ from wayback.resource import ArcResource
 from wayback.resourcestore import ResourceStore
 from gwb.loader import CDXLoaderFactory
 from common import parse_cdx_line
-from grobid2json import do_tei
+from grobid2json import teixml2json
 
 
 class MRExtractCdxGrobid(MRJob):
@@ -147,8 +147,8 @@ class MRExtractCdxGrobid(MRJob):
 
         # Convert TEI XML to JSON
         # TODO:
-        info['grobid0:tei_json'] = do_tei(grobid_response.content, encumbered=True)
-        info['grobid0:metadata'] = do_tei(grobid_response.content, encumbered=False)
+        info['grobid0:tei_json'] = teixml2json(grobid_response.content, encumbered=True)
+        info['grobid0:metadata'] = teixml2json(grobid_response.content, encumbered=False)
 
         # Determine extraction "quality"
         # TODO: