about | summary | refs | log | tree | commit | diff | stats
path: root/extraction/extraction_cdx_grobid.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2018-04-04 11:52:55 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2018-04-04 11:52:55 -0700
commit: 78caa0d7772375903194e79df16d70d831ebd432 (patch)
tree: f77d566a3f8fe0506bd2a9696b3656e7e5a44c59 /extraction/extraction_cdx_grobid.py
parent: fcbcd6d13fd6506980dbcb60f0d7fa2112b19b86 (diff)
download: sandcrawler-78caa0d7772375903194e79df16d70d831ebd432.tar.gz
sandcrawler-78caa0d7772375903194e79df16d70d831ebd432.zip
pep8
Diffstat (limited to 'extraction/extraction_cdx_grobid.py')
-rwxr-xr-x extraction/extraction_cdx_grobid.py 4
1 files changed, 2 insertions, 2 deletions
diff --git a/extraction/extraction_cdx_grobid.py b/extraction/extraction_cdx_grobid.py
index 54d8b71..27668ea 100755
--- a/extraction/extraction_cdx_grobid.py
+++ b/extraction/extraction_cdx_grobid.py
@@ -147,7 +147,7 @@ class MRExtractCdxGrobid(MrJob):
# Fetch data from WARCs in petabox
try:
- rstore = ResourceStore(loaderfactory=CDXLoaderFactory())
+ rstore = ResourceStore(loaderfactory=CDXLoaderFactory())
gwb_record = rstore.load_resource(
info['file:cdx']['warc'],
info['file:cdx']['offset'],
@@ -219,7 +219,7 @@ class MRExtractCdxGrobid(MrJob):
if info is None:
self.increment_counter('lines', status['status'])
return _, status
-
+
# Decide what to bother inserting back into HBase
# Particularly: ('f:c', 'file:mime', 'file:size', 'file:cdx')
grobid_status = info.get('grobid0:status_code', None)