diff options
-rwxr-xr-x | python/filter_scored_matches.py | 9 |
1 file changed, 7 insertions, 2 deletions
diff --git a/python/filter_scored_matches.py b/python/filter_scored_matches.py index 819e13c..a656705 100755 --- a/python/filter_scored_matches.py +++ b/python/filter_scored_matches.py @@ -34,6 +34,10 @@ def tokenize(s, remove_whitespace=False): return s.encode('ascii', 'replace').replace(b'?', b'') def check_authors(left, right): + """ + Intended to check GROBID extracted authors (right) against "known good" + (but maybe not perfect) Crossref metadata authors ("left"). + """ if len(left) == 0: return False if len(left) > len(right): @@ -59,8 +63,9 @@ def test_check_authors(): assert True == check_authors(['one two'], ['One Two']) assert True == check_authors(['two'], ['One Two']) assert True == check_authors(['two'], ['two, one']) - assert True == check_authors(['Mr. Magoo'], ['mago']) - assert True == check_authors(['one', 'two', 'three'], ['one', 'tw', 'thr']) + assert True == check_authors(['mago'], ['Mr. Magoo']) + assert True == check_authors(['Mr. Magoo'], ['Mr Magoo']) + assert True == check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three']) # Rows are (score, grobid, crossref) def process_group(rows): |