diff options
Diffstat (limited to 'mapreduce/extraction_cdx_grobid.py')
-rwxr-xr-x | mapreduce/extraction_cdx_grobid.py | 14 |
1 files changed, 6 insertions, 8 deletions
diff --git a/mapreduce/extraction_cdx_grobid.py b/mapreduce/extraction_cdx_grobid.py index c29b27e..0812884 100755 --- a/mapreduce/extraction_cdx_grobid.py +++ b/mapreduce/extraction_cdx_grobid.py @@ -13,11 +13,10 @@ Requires: - wayback/GWB libraries """ -# XXX: some broken MRO thing going on in here +# XXX: some broken MRO thing going on in here due to python3 object wrangling +# in `wayback` library. Means we can't run pylint. # pylint: skip-file -import io -import sys import xml import json import struct @@ -80,7 +79,7 @@ class MRExtractCdxGrobid(MRJob): # TODO: make these configs accessible from... mrconf.cfg? hb_conn = happybase.Connection(host=host, transport="framed", protocol="compact") - except Exception as err: + except Exception: raise Exception("Couldn't connect to HBase using host: {}".format(host)) self.hb_table = hb_conn.table(self.options.hbase_table) @@ -108,7 +107,7 @@ class MRExtractCdxGrobid(MRJob): try: rstore = ResourceStore(loaderfactory=CDXLoaderFactory()) gwb_record = rstore.load_resource(warc_uri, offset, c_size) - except wayback.exception.ResourceUnavailable as err: + except wayback.exception.ResourceUnavailable: return None, dict(status="error", reason="failed to load file contents from wayback/petabox") @@ -137,7 +136,7 @@ class MRExtractCdxGrobid(MRJob): return None, dict(status="error", reason="connection to GROBID worker") info['grobid0:status_code'] = grobid_response.status_code - if grobid_response.status_code is not 200: + if grobid_response.status_code != 200: # response.text is .content decoded as utf-8 info['grobid0:status'] = json.loads(grobid_response.text) return info, dict(status="error", reason="non-200 GROBID HTTP status", @@ -215,7 +214,7 @@ class MRExtractCdxGrobid(MRJob): # Convert fields to binary for k in list(info.keys()): - if info[k] == None: + if info[k] is None: info.pop(k) elif k in ('f:c', 'file:cdx', 'grobid0:status', 'grobid0:tei_json', 'grobid0:metadata'): @@ -242,4 +241,3 @@ class MRExtractCdxGrobid(MRJob): if __name__ == '__main__': # pragma: no cover MRExtractCdxGrobid.run() - |