diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-04-06 15:04:47 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-04-06 15:04:47 -0700 |
commit | be1eeae2a30907752729a8dd161d0beb2afd9e52 (patch) | |
tree | abba00713f7b63c15a46950380d6ea6b4aff2a59 | |
parent | 783c7559f1c2096891272271f6ed17795e2c63e0 (diff) | |
download | sandcrawler-be1eeae2a30907752729a8dd161d0beb2afd9e52.tar.gz sandcrawler-be1eeae2a30907752729a8dd161d0beb2afd9e52.zip |
renamed do_tei
-rwxr-xr-x | mapreduce/extraction_cdx_grobid.py | 6 |
1 file changed, 3 insertions, 3 deletions
```diff
diff --git a/mapreduce/extraction_cdx_grobid.py b/mapreduce/extraction_cdx_grobid.py
index d1ec840..8fa1720 100755
--- a/mapreduce/extraction_cdx_grobid.py
+++ b/mapreduce/extraction_cdx_grobid.py
@@ -30,7 +30,7 @@ from wayback.resource import ArcResource
 from wayback.resourcestore import ResourceStore
 from gwb.loader import CDXLoaderFactory
 from common import parse_cdx_line
-from grobid2json import do_tei
+from grobid2json import teixml2json
 
 
 class MRExtractCdxGrobid(MRJob):
@@ -147,8 +147,8 @@ class MRExtractCdxGrobid(MRJob):
 
         # Convert TEI XML to JSON
         # TODO:
-        info['grobid0:tei_json'] = do_tei(grobid_response.content, encumbered=True)
-        info['grobid0:metadata'] = do_tei(grobid_response.content, encumbered=False)
+        info['grobid0:tei_json'] = teixml2json(grobid_response.content, encumbered=True)
+        info['grobid0:metadata'] = teixml2json(grobid_response.content, encumbered=False)
 
         # Determine extraction "quality"
         # TODO:
```