diff options
Diffstat (limited to 'python')
-rw-r--r-- | python/.pylintrc | 2 |
-rwxr-xr-x | python/filter_scored_matches.py | 33 |
-rwxr-xr-x | python/manifest_converter.py | 12 |
3 files changed, 23 insertions, 24 deletions
diff --git a/python/.pylintrc b/python/.pylintrc index 78e9e7f..0b3342d 100644 --- a/python/.pylintrc +++ b/python/.pylintrc @@ -1,5 +1,5 @@ [MESSAGES CONTROL] -disable=C0323,W0142,C0301,C0103,C0111,E0213,C0302,C0203,W0703,R0201,W0223,bad-continuation,arguments-differ,unidiomatic-typecheck +disable=C0323,W0142,C0301,C0103,C0111,E0213,C0302,C0203,W0703,R0201,W0223,bad-continuation,arguments-differ,unidiomatic-typecheck,len-as-condition [REPORTS] output-format=colorized diff --git a/python/filter_scored_matches.py b/python/filter_scored_matches.py index a656705..900374d 100755 --- a/python/filter_scored_matches.py +++ b/python/filter_scored_matches.py @@ -38,34 +38,34 @@ def check_authors(left, right): Intended to check GROBID extracted authors (right) against "known good" (but maybe not perfect) Crossref metadata authors ("left"). """ - if len(left) == 0: + if not left: return False if len(left) > len(right): return False right_all = tokenize(" ".join(right)) for i in range(len(left)): l = left[i].lower().replace('jr.', '').split() - if len(l) == 0: + if not l: return False l = tokenize(l[-1]) if len(l) <= 1: # weird author name (single char) return False - if not l in right_all: + if l not in right_all: #print("MISSING: {} from {}".format(l.decode('utf8'), right_all.decode('utf8'))) return False return True def test_check_authors(): - assert False == check_authors([], []) - assert False == check_authors([], ['one']) - assert True == check_authors(['one'], ['one']) - assert True == check_authors(['one two'], ['One Two']) - assert True == check_authors(['two'], ['One Two']) - assert True == check_authors(['two'], ['two, one']) - assert True == check_authors(['mago'], ['Mr. Magoo']) - assert True == check_authors(['Mr. 
Magoo'], ['Mr Magoo']) - assert True == check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three']) + assert not check_authors([], []) + assert not check_authors([], ['one']) + assert check_authors(['one'], ['one']) + assert check_authors(['one two'], ['One Two']) + assert check_authors(['two'], ['One Two']) + assert check_authors(['two'], ['two, one']) + assert check_authors(['mago'], ['Mr. Magoo']) + assert check_authors(['Mr. Magoo'], ['Mr Magoo']) + assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three']) # Rows are (score, grobid, crossref) def process_group(rows): @@ -89,7 +89,7 @@ def process_group(rows): l = keepers.get(sha1, list()) l.append(doi) keepers[sha1] = l - for key, value in keepers.items(): + for value in keepers.values(): print("{}\t{}".format(sha1, json.dumps(value))) def run(): @@ -100,15 +100,16 @@ def run(): # group lines by slug, and process in batches for line in sys.stdin: line = line.strip().split('\t') - assert(len(line) == 4) + assert len(line) == 4 slug = line[0] - if last_slug and slug != last_slug and len(lines) > 0: + if last_slug and slug != last_slug and lines: process_group(lines) lines = [] last_slug = slug lines.append(line[1:]) - if len(lines) > 0: + # catch any remaining + if lines: process_group(lines) if __name__=='__main__': diff --git a/python/manifest_converter.py b/python/manifest_converter.py index f0d0bc7..35cee5b 100755 --- a/python/manifest_converter.py +++ b/python/manifest_converter.py @@ -13,7 +13,6 @@ to JSON format for fast fatcat importing. import sys import json import sqlite3 -import itertools # iterate over rows in files metadata... # 1. 
select all identified DOIs @@ -24,23 +23,22 @@ import itertools def or_none(s): if s is None: return None - elif type(s) == str and (len(s) == 0 or s == "\\N" or s == "-"): + elif type(s) == str and ((not s) or s == "\\N" or s == "-"): return None - else: - return s + return s def process_db(db_path): - + db = sqlite3.connect(db_path) for row in db.execute("SELECT sha1, mimetype, size_bytes, md5 FROM files_metadata"): sha1 = row[0] dois = db.execute("SELECT doi FROM files_id_doi WHERE sha1=?", [sha1]).fetchall() dois = [d[0] for d in dois] - if len(dois) == 0: + if not dois: continue urls = db.execute("SELECT url, datetime FROM urls WHERE sha1=?", [sha1]).fetchall() - if len(urls) == 0: + if not urls: continue cdx = [dict(url=row[0], dt=row[1]) for row in urls] obj = dict( |