aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2018-04-06 15:04:47 -0700
committer Bryan Newbold <bnewbold@archive.org> 2018-04-06 15:04:47 -0700
commit be1eeae2a30907752729a8dd161d0beb2afd9e52 (patch)
tree abba00713f7b63c15a46950380d6ea6b4aff2a59
parent 783c7559f1c2096891272271f6ed17795e2c63e0 (diff)
download sandcrawler-be1eeae2a30907752729a8dd161d0beb2afd9e52.tar.gz
sandcrawler-be1eeae2a30907752729a8dd161d0beb2afd9e52.zip
renamed do_tei to teixml2json
-rwxr-xr-x mapreduce/extraction_cdx_grobid.py 6
1 files changed, 3 insertions, 3 deletions
diff --git a/mapreduce/extraction_cdx_grobid.py b/mapreduce/extraction_cdx_grobid.py
index d1ec840..8fa1720 100755
--- a/mapreduce/extraction_cdx_grobid.py
+++ b/mapreduce/extraction_cdx_grobid.py
@@ -30,7 +30,7 @@ from wayback.resource import ArcResource
from wayback.resourcestore import ResourceStore
from gwb.loader import CDXLoaderFactory
from common import parse_cdx_line
-from grobid2json import do_tei
+from grobid2json import teixml2json
class MRExtractCdxGrobid(MRJob):
@@ -147,8 +147,8 @@ class MRExtractCdxGrobid(MRJob):
# Convert TEI XML to JSON
# TODO:
- info['grobid0:tei_json'] = do_tei(grobid_response.content, encumbered=True)
- info['grobid0:metadata'] = do_tei(grobid_response.content, encumbered=False)
+ info['grobid0:tei_json'] = teixml2json(grobid_response.content, encumbered=True)
+ info['grobid0:metadata'] = teixml2json(grobid_response.content, encumbered=False)
# Determine extraction "quality"
# TODO: