diff options
Diffstat (limited to 'python/scripts/filter_scored_matches.py')
-rwxr-xr-x | python/scripts/filter_scored_matches.py | 49 |
1 files changed, 27 insertions, 22 deletions
def tokenize(s: str, remove_whitespace: bool = False) -> bytes:
    """Normalize a string into a crude ASCII token for fuzzy comparison.

    The input is lowercased, every character that is neither alphanumeric
    nor whitespace is dropped, and the result is encoded as ASCII with all
    non-ASCII characters discarded.

    Args:
        s: Text to normalize (e.g. an author name).
        remove_whitespace: When true, also strip all whitespace so the
            result is a single run of characters.

    Returns:
        The normalized text as ASCII ``bytes``; may be empty.
    """
    # NOTE: an earlier version called ``s.replace(...)`` here to normalize
    # typographic apostrophes but discarded the result (str is immutable).
    # The call is dropped rather than "fixed": apostrophes of either kind are
    # non-alphanumeric and are removed by the filter below regardless.
    s = "".join(c for c in s.lower() if c.isalnum() or c.isspace())

    if remove_whitespace:
        s = "".join(s.split())

    # Lossy ASCII folding: non-ASCII letters are dropped, not transliterated
    # (TODO: this is horrible). "ignore" is equivalent to the previous
    # encode-with-"replace"-then-strip-b"?" because no literal "?" can
    # survive the alphanumeric filter above.
    return s.encode("ascii", "ignore")
Magoo'], ['Mr Magoo']) - assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three']) + assert not check_authors([], ["one"]) + assert check_authors(["one"], ["one"]) + assert check_authors(["one two"], ["One Two"]) + assert check_authors(["two"], ["One Two"]) + assert check_authors(["two"], ["two, one"]) + assert check_authors(["mago"], ["Mr. Magoo"]) + assert check_authors(["Mr. Magoo"], ["Mr Magoo"]) + assert check_authors(["one", "tw", "thr"], ["one", "two", "three"]) + # Rows are (score, grobid, crossref) def process_group(rows): @@ -78,20 +81,21 @@ def process_group(rows): continue grobid = json.loads(row[1]) crossref = json.loads(row[2]) - if not check_authors(crossref['authors'], grobid['authors']): - #print("NO (crossref/grobid): {} {}".format(crossref['authors'], grobid['authors'])) + if not check_authors(crossref["authors"], grobid["authors"]): + # print("NO (crossref/grobid): {} {}".format(crossref['authors'], grobid['authors'])) continue else: - #print("YES: {} {}".format(crossref['authors'], grobid['authors'])) + # print("YES: {} {}".format(crossref['authors'], grobid['authors'])) pass - sha1 = grobid['sha1'] - doi = crossref['doi'].lower() + sha1 = grobid["sha1"] + doi = crossref["doi"].lower() l = keepers.get(sha1, list()) l.append(doi) keepers[sha1] = l for sha1, doi_list in keepers.items(): print("{}\t{}".format(sha1, json.dumps(doi_list))) + def run(): last_slug = None @@ -99,7 +103,7 @@ def run(): # group lines by slug, and process in batches for line in sys.stdin: - line = line.strip().split('\t') + line = line.strip().split("\t") assert len(line) == 4 slug = line[0] if last_slug and slug != last_slug and lines: @@ -112,5 +116,6 @@ def run(): if lines: process_group(lines) -if __name__=='__main__': + +if __name__ == "__main__": run() |