author     Bryan Newbold <bnewbold@archive.org>   2018-09-18 10:43:44 -0700
committer  Bryan Newbold <bnewbold@archive.org>   2018-09-18 10:43:44 -0700
commit     c99a366003ed399f0e0dd17cc8086dbccd078279 (patch)
tree       93d031dcf72c1592b9b904df6cd7cf08c3255e4e
parent     6d96f550d63313a9acebb661e78edbfdcf936956 (diff)
pass more pylint
-rw-r--r--  python/.pylintrc                  2
-rwxr-xr-x  python/filter_scored_matches.py  33
-rwxr-xr-x  python/manifest_converter.py     12

3 files changed, 23 insertions, 24 deletions
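Most of the changes below swap explicit length comparisons for Python truthiness tests, the pattern pylint's len-as-condition check flags (the same check the .pylintrc hunk adds to the disable list). A minimal sketch of the idiom, with a hypothetical helper not taken from this repository:

# Empty sequences are falsy in Python, so `not seq` / `if seq:` replaces
# `len(seq) == 0` / `len(seq) > 0` throughout this commit.
def first_or_none(items):
    # before: if len(items) == 0: return None
    if not items:
        return None
    return items[0]

assert first_or_none([]) is None
assert first_or_none(['a', 'b']) == 'a'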
diff --git a/python/.pylintrc b/python/.pylintrc
index 78e9e7f..0b3342d 100644
--- a/python/.pylintrc
+++ b/python/.pylintrc
@@ -1,5 +1,5 @@
 [MESSAGES CONTROL]
-disable=C0323,W0142,C0301,C0103,C0111,E0213,C0302,C0203,W0703,R0201,W0223,bad-continuation,arguments-differ,unidiomatic-typecheck
+disable=C0323,W0142,C0301,C0103,C0111,E0213,C0302,C0203,W0703,R0201,W0223,bad-continuation,arguments-differ,unidiomatic-typecheck,len-as-condition
 
 [REPORTS]
 output-format=colorized
diff --git a/python/filter_scored_matches.py b/python/filter_scored_matches.py
index a656705..900374d 100755
--- a/python/filter_scored_matches.py
+++ b/python/filter_scored_matches.py
@@ -38,34 +38,34 @@ def check_authors(left, right):
     Intended to check GROBID extracted authors (right) against "known good"
     (but maybe not perfect) Crossref metadata authors ("left").
     """
-    if len(left) == 0:
+    if not left:
         return False
     if len(left) > len(right):
         return False
     right_all = tokenize(" ".join(right))
     for i in range(len(left)):
         l = left[i].lower().replace('jr.', '').split()
-        if len(l) == 0:
+        if not l:
             return False
         l = tokenize(l[-1])
         if len(l) <= 1:
             # weird author name (single char)
             return False
-        if not l in right_all:
+        if l not in right_all:
             #print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8')))
             return False
     return True
 
 def test_check_authors():
-    assert False == check_authors([], [])
-    assert False == check_authors([], ['one'])
-    assert True == check_authors(['one'], ['one'])
-    assert True == check_authors(['one two'], ['One Two'])
-    assert True == check_authors(['two'], ['One Two'])
-    assert True == check_authors(['two'], ['two, one'])
-    assert True == check_authors(['mago'], ['Mr. Magoo'])
-    assert True == check_authors(['Mr. Magoo'], ['Mr Magoo'])
-    assert True == check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three'])
+    assert not check_authors([], [])
+    assert not check_authors([], ['one'])
+    assert check_authors(['one'], ['one'])
+    assert check_authors(['one two'], ['One Two'])
+    assert check_authors(['two'], ['One Two'])
+    assert check_authors(['two'], ['two, one'])
+    assert check_authors(['mago'], ['Mr. Magoo'])
+    assert check_authors(['Mr. Magoo'], ['Mr Magoo'])
+    assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three'])
 
 # Rows are (score, grobid, crossref)
 def process_group(rows):
@@ -89,7 +89,7 @@ def process_group(rows):
         l = keepers.get(sha1, list())
         l.append(doi)
         keepers[sha1] = l
-    for key, value in keepers.items():
+    for value in keepers.values():
         print("{}\t{}".format(sha1, json.dumps(value)))
 
 def run():
@@ -100,15 +100,16 @@ def run():
     # group lines by slug, and process in batches
     for line in sys.stdin:
         line = line.strip().split('\t')
-        assert(len(line) == 4)
+        assert len(line) == 4
        slug = line[0]
-        if last_slug and slug != last_slug and len(lines) > 0:
+        if last_slug and slug != last_slug and lines:
             process_group(lines)
             lines = []
         last_slug = slug
         lines.append(line[1:])
-    if len(lines) > 0:
+    # catch any remaining
+    if lines:
         process_group(lines)
 
 if __name__=='__main__':
diff --git a/python/manifest_converter.py b/python/manifest_converter.py
index f0d0bc7..35cee5b 100755
--- a/python/manifest_converter.py
+++ b/python/manifest_converter.py
@@ -13,7 +13,6 @@ to JSON format for fast fatcat importing.
 import sys
 import json
 import sqlite3
-import itertools
 
 # iterate over rows in files metadata...
 # 1. select all identified DOIs
@@ -24,23 +23,22 @@ import itertools
 def or_none(s):
     if s is None:
         return None
-    elif type(s) == str and (len(s) == 0 or s == "\\N" or s == "-"):
+    elif type(s) == str and ((not s) or s == "\\N" or s == "-"):
         return None
-    else:
-        return s
+    return s
 
 def process_db(db_path):
-
+
     db = sqlite3.connect(db_path)
 
     for row in db.execute("SELECT sha1, mimetype, size_bytes, md5 FROM files_metadata"):
         sha1 = row[0]
         dois = db.execute("SELECT doi FROM files_id_doi WHERE sha1=?", [sha1]).fetchall()
         dois = [d[0] for d in dois]
-        if len(dois) == 0:
+        if not dois:
             continue
         urls = db.execute("SELECT url, datetime FROM urls WHERE sha1=?", [sha1]).fetchall()
-        if len(urls) == 0:
+        if not urls:
             continue
         cdx = [dict(url=row[0], dt=row[1]) for row in urls]
         obj = dict(
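For context, a sketch of the streaming pattern run() relies on (hypothetical names, not the repository's code): stdin is tab-separated and sorted by a leading slug column, rows sharing a slug are buffered and flushed whenever the slug changes, and a final flush handles the trailing group, the branch this commit labels "# catch any remaining".

import sys

def group_by_slug(stream):
    # Buffer consecutive rows that share the first (slug) column and
    # yield each completed batch; assumes the input is sorted by slug.
    last_slug = None
    batch = []
    for raw in stream:
        fields = raw.rstrip('\n').split('\t')
        slug = fields[0]
        if last_slug is not None and slug != last_slug and batch:
            yield last_slug, batch
            batch = []
        last_slug = slug
        batch.append(fields[1:])
    # final flush for the last group ("catch any remaining")
    if batch:
        yield last_slug, batch

if __name__ == '__main__':
    for slug, rows in group_by_slug(sys.stdin):
        print("{}\t{}".format(slug, len(rows)))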