diff options
Diffstat (limited to 'mapreduce')
-rw-r--r-- | mapreduce/.pylintrc | 2 | ||||
-rwxr-xr-x | mapreduce/backfill_hbase_from_cdx.py | 13 | ||||
-rw-r--r-- | mapreduce/common.py | 10 | ||||
-rwxr-xr-x | mapreduce/grobid2json.py | 2 | ||||
-rw-r--r-- | mapreduce/xml2json.py | 1 |
5 files changed, 10 insertions, 18 deletions
diff --git a/mapreduce/.pylintrc b/mapreduce/.pylintrc index 2ec9967..5dc3ce0 100644 --- a/mapreduce/.pylintrc +++ b/mapreduce/.pylintrc @@ -1,5 +1,5 @@ [MESSAGES CONTROL] -disable=C0323,W0142,C0301,C0103,C0111,E0213,C0302,C0203,W0703,R0201,W0223 +disable=C0323,W0142,C0301,C0103,C0111,E0213,C0302,C0203,W0703,R0201,W0223,bad-continuation,arguments-differ,unidiomatic-typecheck [REPORTS] output-format=colorized diff --git a/mapreduce/backfill_hbase_from_cdx.py b/mapreduce/backfill_hbase_from_cdx.py index 2643195..72331b0 100755 --- a/mapreduce/backfill_hbase_from_cdx.py +++ b/mapreduce/backfill_hbase_from_cdx.py @@ -16,7 +16,6 @@ TODO: - sentry integration for error reporting """ -import sys import json import happybase import mrjob @@ -56,8 +55,8 @@ class MRCDXBackfillHBase(MRJob): host = self.options.hbase_host # TODO: make these configs accessible from... mrconf.cfg? hb_conn = happybase.Connection(host=host, transport="framed", - protocol="compact") - except Exception as err: + protocol="compact") + except Exception: raise Exception("Couldn't connect to HBase using host: {}".format(host)) self.hb_table = hb_conn.table(self.options.hbase_table) @@ -67,9 +66,6 @@ class MRCDXBackfillHBase(MRJob): if (raw_cdx.startswith(' ') or raw_cdx.startswith('filedesc') or raw_cdx.startswith('#')): - - # Skip line - # XXX: tests don't cover this path; need coverage! self.increment_counter('lines', 'invalid') yield _, dict(status="invalid", reason="line prefix") return @@ -87,8 +83,8 @@ class MRCDXBackfillHBase(MRJob): key = info.pop('key') info['f:c'] = json.dumps(info['f:c'], sort_keys=True, indent=None) - info['file:cdx'] = json.dumps(info['file:cdx'], sort_keys=True, - indent=None) + info['file:cdx'] = json.dumps(info['file:cdx'], + sort_keys=True, indent=None) self.hb_table.put(key, info) self.increment_counter('lines', 'success') @@ -97,4 +93,3 @@ class MRCDXBackfillHBase(MRJob): if __name__ == '__main__': # pragma: no cover MRCDXBackfillHBase.run() - diff --git a/mapreduce/common.py b/mapreduce/common.py index 1b8e572..6710044 100644 --- a/mapreduce/common.py +++ b/mapreduce/common.py @@ -14,7 +14,7 @@ def normalize_mime(raw): if raw.startswith(norm): return norm - # Special cases + # Special cases if raw.startswith('application/xml'): return 'text/xml' if raw.startswith('application/x-pdf'): @@ -23,14 +23,14 @@ def normalize_mime(raw): def test_normalize_mime(): - assert normalize_mime("asdf") == None + assert normalize_mime("asdf") is None assert normalize_mime("application/pdf") == "application/pdf" assert normalize_mime("application/pdf+journal") == "application/pdf" assert normalize_mime("Application/PDF") == "application/pdf" - assert normalize_mime("application/p") == None + assert normalize_mime("application/p") is None assert normalize_mime("application/xml+stuff") == "text/xml" assert normalize_mime("application/x-pdf") == "application/pdf" - assert normalize_mime("application/x-html") == None + assert normalize_mime("application/x-html") is None def parse_cdx_line(raw_cdx): @@ -65,7 +65,7 @@ def parse_cdx_line(raw_cdx): warc_file = warc.split('/')[-1] try: dt_iso = datetime.strptime(dt, "%Y%m%d%H%M%S").isoformat() - except: + except Exception: return None # 'i' intentionally not set diff --git a/mapreduce/grobid2json.py b/mapreduce/grobid2json.py index 52a3125..c1ff1f1 100755 --- a/mapreduce/grobid2json.py +++ b/mapreduce/grobid2json.py @@ -23,9 +23,7 @@ A flag can be specified to disable copyright encumbered bits (--no-emcumbered): Prints JSON to stdout, errors to stderr """ -import os import io -import sys import json import argparse import xml.etree.ElementTree as ET diff --git a/mapreduce/xml2json.py b/mapreduce/xml2json.py index f956014..df5064f 100644 --- a/mapreduce/xml2json.py +++ b/mapreduce/xml2json.py @@ -1,6 +1,5 @@ import json -import sys import xmltodict with open('tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml', 'rb') as f: |