diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-04-04 11:52:55 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-04-04 11:52:55 -0700 |
commit | 78caa0d7772375903194e79df16d70d831ebd432 (patch) | |
tree | f77d566a3f8fe0506bd2a9696b3656e7e5a44c59 /extraction/extraction_cdx_grobid.py | |
parent | fcbcd6d13fd6506980dbcb60f0d7fa2112b19b86 (diff) | |
download | sandcrawler-78caa0d7772375903194e79df16d70d831ebd432.tar.gz sandcrawler-78caa0d7772375903194e79df16d70d831ebd432.zip |
pep8
Diffstat (limited to 'extraction/extraction_cdx_grobid.py')
-rwxr-xr-x | extraction/extraction_cdx_grobid.py | 4 |
1 file changed, 2 insertions, 2 deletions
diff --git a/extraction/extraction_cdx_grobid.py b/extraction/extraction_cdx_grobid.py
index 54d8b71..27668ea 100755
--- a/extraction/extraction_cdx_grobid.py
+++ b/extraction/extraction_cdx_grobid.py
@@ -147,7 +147,7 @@ class MRExtractCdxGrobid(MrJob):

         # Fetch data from WARCs in petabox
         try:
-            rstore = ResourceStore(loaderfactory=CDXLoaderFactory())
+            rstore = ResourceStore(loaderfactory=CDXLoaderFactory())
             gwb_record = rstore.load_resource(
                 info['file:cdx']['warc'],
                 info['file:cdx']['offset'],
@@ -219,7 +219,7 @@ class MRExtractCdxGrobid(MrJob):
         if info is None:
             self.increment_counter('lines', status['status'])
             return _, status
-
+
         # Decide what to bother inserting back into HBase
         # Particularly: ('f:c', 'file:mime', 'file:size', 'file:cdx')
         grobid_status = info.get('grobid0:status_code', None)