diff options
Diffstat (limited to 'extraction')
-rwxr-xr-x | extraction/extraction_cdx_grobid.py | 4 | ||||
-rwxr-xr-x | extraction/grobid2json.py | 2 |
2 files changed, 3 insertions, 3 deletions
diff --git a/extraction/extraction_cdx_grobid.py b/extraction/extraction_cdx_grobid.py
index 54d8b71..27668ea 100755
--- a/extraction/extraction_cdx_grobid.py
+++ b/extraction/extraction_cdx_grobid.py
@@ -147,7 +147,7 @@ class MRExtractCdxGrobid(MrJob):
 
         # Fetch data from WARCs in petabox
         try:
-            rstore = ResourceStore(loaderfactory=CDXLoaderFactory())
+            rstore = ResourceStore(loaderfactory=CDXLoaderFactory())
             gwb_record = rstore.load_resource(
                 info['file:cdx']['warc'],
                 info['file:cdx']['offset'],
@@ -219,7 +219,7 @@ class MRExtractCdxGrobid(MrJob):
         if info is None:
             self.increment_counter('lines', status['status'])
             return _, status
-
+
         # Decide what to bother inserting back into HBase
         # Particularly: ('f:c', 'file:mime', 'file:size', 'file:cdx')
         grobid_status = info.get('grobid0:status_code', None)
diff --git a/extraction/grobid2json.py b/extraction/grobid2json.py
index 874ac8b..daf9387 100755
--- a/extraction/grobid2json.py
+++ b/extraction/grobid2json.py
@@ -107,7 +107,7 @@ def do_tei(path, encumbered=True):
     el = tei.find('.//{%s}back/{%s}div[@type="annex"]' % (ns, ns))
     info['annex'] = (el or None) and " ".join(el.itertext()).strip()
 
-    return info
+    return info
 
 def main():
     parser = argparse.ArgumentParser(