about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2018-04-04 11:52:55 -0700
committer Bryan Newbold <bnewbold@archive.org> 2018-04-04 11:52:55 -0700
commit 78caa0d7772375903194e79df16d70d831ebd432 (patch)
tree f77d566a3f8fe0506bd2a9696b3656e7e5a44c59
parent fcbcd6d13fd6506980dbcb60f0d7fa2112b19b86 (diff)
download sandcrawler-78caa0d7772375903194e79df16d70d831ebd432.tar.gz
sandcrawler-78caa0d7772375903194e79df16d70d831ebd432.zip
pep8
-rwxr-xr-x extraction/extraction_cdx_grobid.py 4
-rwxr-xr-x extraction/grobid2json.py 2
2 files changed, 3 insertions, 3 deletions
diff --git a/extraction/extraction_cdx_grobid.py b/extraction/extraction_cdx_grobid.py
index 54d8b71..27668ea 100755
--- a/extraction/extraction_cdx_grobid.py
+++ b/extraction/extraction_cdx_grobid.py
@@ -147,7 +147,7 @@ class MRExtractCdxGrobid(MrJob):
# Fetch data from WARCs in petabox
try:
- rstore = ResourceStore(loaderfactory=CDXLoaderFactory())
+ rstore = ResourceStore(loaderfactory=CDXLoaderFactory())
gwb_record = rstore.load_resource(
info['file:cdx']['warc'],
info['file:cdx']['offset'],
@@ -219,7 +219,7 @@ class MRExtractCdxGrobid(MrJob):
if info is None:
self.increment_counter('lines', status['status'])
return _, status
-
+
# Decide what to bother inserting back into HBase
# Particularly: ('f:c', 'file:mime', 'file:size', 'file:cdx')
grobid_status = info.get('grobid0:status_code', None)
diff --git a/extraction/grobid2json.py b/extraction/grobid2json.py
index 874ac8b..daf9387 100755
--- a/extraction/grobid2json.py
+++ b/extraction/grobid2json.py
@@ -107,7 +107,7 @@ def do_tei(path, encumbered=True):
el = tei.find('.//{%s}back/{%s}div[@type="annex"]' % (ns, ns))
info['annex'] = (el or None) and " ".join(el.itertext()).strip()
- return info
+ return info
def main():
parser = argparse.ArgumentParser(