about | summary | refs | log | tree | commit | diff | stats
path: root/extraction
diff options
context:
space:
mode:
Diffstat (limited to 'extraction')
-rwxr-xr-x extraction/extraction_cdx_grobid.py 4
-rwxr-xr-x extraction/grobid2json.py 2
2 files changed, 3 insertions, 3 deletions
diff --git a/extraction/extraction_cdx_grobid.py b/extraction/extraction_cdx_grobid.py
index 54d8b71..27668ea 100755
--- a/extraction/extraction_cdx_grobid.py
+++ b/extraction/extraction_cdx_grobid.py
@@ -147,7 +147,7 @@ class MRExtractCdxGrobid(MrJob):
# Fetch data from WARCs in petabox
try:
- rstore = ResourceStore(loaderfactory=CDXLoaderFactory())
+ rstore = ResourceStore(loaderfactory=CDXLoaderFactory())
gwb_record = rstore.load_resource(
info['file:cdx']['warc'],
info['file:cdx']['offset'],
@@ -219,7 +219,7 @@ class MRExtractCdxGrobid(MrJob):
if info is None:
self.increment_counter('lines', status['status'])
return _, status
-
+
# Decide what to bother inserting back into HBase
# Particularly: ('f:c', 'file:mime', 'file:size', 'file:cdx')
grobid_status = info.get('grobid0:status_code', None)
diff --git a/extraction/grobid2json.py b/extraction/grobid2json.py
index 874ac8b..daf9387 100755
--- a/extraction/grobid2json.py
+++ b/extraction/grobid2json.py
@@ -107,7 +107,7 @@ def do_tei(path, encumbered=True):
el = tei.find('.//{%s}back/{%s}div[@type="annex"]' % (ns, ns))
info['annex'] = (el or None) and " ".join(el.itertext()).strip()
- return info
+ return info
def main():
parser = argparse.ArgumentParser(