diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-04-04 11:52:55 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-04-04 11:52:55 -0700 |
commit | 78caa0d7772375903194e79df16d70d831ebd432 (patch) | |
tree | f77d566a3f8fe0506bd2a9696b3656e7e5a44c59 | |
parent | fcbcd6d13fd6506980dbcb60f0d7fa2112b19b86 (diff) | |
download | sandcrawler-78caa0d7772375903194e79df16d70d831ebd432.tar.gz sandcrawler-78caa0d7772375903194e79df16d70d831ebd432.zip |
pep8
-rwxr-xr-x | extraction/extraction_cdx_grobid.py | 4 | ||||
-rwxr-xr-x | extraction/grobid2json.py | 2 |
2 files changed, 3 insertions, 3 deletions
diff --git a/extraction/extraction_cdx_grobid.py b/extraction/extraction_cdx_grobid.py
index 54d8b71..27668ea 100755
--- a/extraction/extraction_cdx_grobid.py
+++ b/extraction/extraction_cdx_grobid.py
@@ -147,7 +147,7 @@ class MRExtractCdxGrobid(MrJob):
 
         # Fetch data from WARCs in petabox
         try:
-            rstore = ResourceStore(loaderfactory=CDXLoaderFactory())
+            rstore = ResourceStore(loaderfactory=CDXLoaderFactory())
             gwb_record = rstore.load_resource(
                 info['file:cdx']['warc'],
                 info['file:cdx']['offset'],
@@ -219,7 +219,7 @@ class MRExtractCdxGrobid(MrJob):
         if info is None:
             self.increment_counter('lines', status['status'])
             return _, status
-
+
         # Decide what to bother inserting back into HBase
         # Particularly: ('f:c', 'file:mime', 'file:size', 'file:cdx')
         grobid_status = info.get('grobid0:status_code', None)
diff --git a/extraction/grobid2json.py b/extraction/grobid2json.py
index 874ac8b..daf9387 100755
--- a/extraction/grobid2json.py
+++ b/extraction/grobid2json.py
@@ -107,7 +107,7 @@ def do_tei(path, encumbered=True):
     el = tei.find('.//{%s}back/{%s}div[@type="annex"]' % (ns, ns))
     info['annex'] = (el or None) and " ".join(el.itertext()).strip()
 
-    return info
+    return info
 
 def main():
     parser = argparse.ArgumentParser(