about | summary | refs | log | tree | commit | diff | stats
path: root/mapreduce
diff options
context:
space:
mode:
Diffstat (limited to 'mapreduce')
-rw-r--r-- mapreduce/.pylintrc 2
-rwxr-xr-x mapreduce/backfill_hbase_from_cdx.py 13
-rw-r--r-- mapreduce/common.py 10
-rwxr-xr-x mapreduce/grobid2json.py 2
-rw-r--r-- mapreduce/xml2json.py 1
5 files changed, 10 insertions, 18 deletions
diff --git a/mapreduce/.pylintrc b/mapreduce/.pylintrc
index 2ec9967..5dc3ce0 100644
--- a/mapreduce/.pylintrc
+++ b/mapreduce/.pylintrc
@@ -1,5 +1,5 @@
[MESSAGES CONTROL]
-disable=C0323,W0142,C0301,C0103,C0111,E0213,C0302,C0203,W0703,R0201,W0223
+disable=C0323,W0142,C0301,C0103,C0111,E0213,C0302,C0203,W0703,R0201,W0223,bad-continuation,arguments-differ,unidiomatic-typecheck
[REPORTS]
output-format=colorized
diff --git a/mapreduce/backfill_hbase_from_cdx.py b/mapreduce/backfill_hbase_from_cdx.py
index 2643195..72331b0 100755
--- a/mapreduce/backfill_hbase_from_cdx.py
+++ b/mapreduce/backfill_hbase_from_cdx.py
@@ -16,7 +16,6 @@ TODO:
- sentry integration for error reporting
"""
-import sys
import json
import happybase
import mrjob
@@ -56,8 +55,8 @@ class MRCDXBackfillHBase(MRJob):
host = self.options.hbase_host
# TODO: make these configs accessible from... mrconf.cfg?
hb_conn = happybase.Connection(host=host, transport="framed",
- protocol="compact")
- except Exception as err:
+ protocol="compact")
+ except Exception:
raise Exception("Couldn't connect to HBase using host: {}".format(host))
self.hb_table = hb_conn.table(self.options.hbase_table)
@@ -67,9 +66,6 @@ class MRCDXBackfillHBase(MRJob):
if (raw_cdx.startswith(' ') or raw_cdx.startswith('filedesc') or
raw_cdx.startswith('#')):
-
- # Skip line
- # XXX: tests don't cover this path; need coverage!
self.increment_counter('lines', 'invalid')
yield _, dict(status="invalid", reason="line prefix")
return
@@ -87,8 +83,8 @@ class MRCDXBackfillHBase(MRJob):
key = info.pop('key')
info['f:c'] = json.dumps(info['f:c'], sort_keys=True, indent=None)
- info['file:cdx'] = json.dumps(info['file:cdx'], sort_keys=True,
- indent=None)
+ info['file:cdx'] = json.dumps(info['file:cdx'],
+ sort_keys=True, indent=None)
self.hb_table.put(key, info)
self.increment_counter('lines', 'success')
@@ -97,4 +93,3 @@ class MRCDXBackfillHBase(MRJob):
if __name__ == '__main__': # pragma: no cover
MRCDXBackfillHBase.run()
-
diff --git a/mapreduce/common.py b/mapreduce/common.py
index 1b8e572..6710044 100644
--- a/mapreduce/common.py
+++ b/mapreduce/common.py
@@ -14,7 +14,7 @@ def normalize_mime(raw):
if raw.startswith(norm):
return norm
- # Special cases
+ # Special cases
if raw.startswith('application/xml'):
return 'text/xml'
if raw.startswith('application/x-pdf'):
@@ -23,14 +23,14 @@ def normalize_mime(raw):
def test_normalize_mime():
- assert normalize_mime("asdf") == None
+ assert normalize_mime("asdf") is None
assert normalize_mime("application/pdf") == "application/pdf"
assert normalize_mime("application/pdf+journal") == "application/pdf"
assert normalize_mime("Application/PDF") == "application/pdf"
- assert normalize_mime("application/p") == None
+ assert normalize_mime("application/p") is None
assert normalize_mime("application/xml+stuff") == "text/xml"
assert normalize_mime("application/x-pdf") == "application/pdf"
- assert normalize_mime("application/x-html") == None
+ assert normalize_mime("application/x-html") is None
def parse_cdx_line(raw_cdx):
@@ -65,7 +65,7 @@ def parse_cdx_line(raw_cdx):
warc_file = warc.split('/')[-1]
try:
dt_iso = datetime.strptime(dt, "%Y%m%d%H%M%S").isoformat()
- except:
+ except Exception:
return None
# 'i' intentionally not set
diff --git a/mapreduce/grobid2json.py b/mapreduce/grobid2json.py
index 52a3125..c1ff1f1 100755
--- a/mapreduce/grobid2json.py
+++ b/mapreduce/grobid2json.py
@@ -23,9 +23,7 @@ A flag can be specified to disable copyright encumbered bits (--no-emcumbered):
Prints JSON to stdout, errors to stderr
"""
-import os
import io
-import sys
import json
import argparse
import xml.etree.ElementTree as ET
diff --git a/mapreduce/xml2json.py b/mapreduce/xml2json.py
index f956014..df5064f 100644
--- a/mapreduce/xml2json.py
+++ b/mapreduce/xml2json.py
@@ -1,6 +1,5 @@
import json
-import sys
import xmltodict
with open('tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml', 'rb') as f: