about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rwxr-xr-x python/filter_scored_matches.py 9
1 files changed, 7 insertions, 2 deletions
diff --git a/python/filter_scored_matches.py b/python/filter_scored_matches.py
index 819e13c..a656705 100755
--- a/python/filter_scored_matches.py
+++ b/python/filter_scored_matches.py
@@ -34,6 +34,10 @@ def tokenize(s, remove_whitespace=False):
return s.encode('ascii', 'replace').replace(b'?', b'')
def check_authors(left, right):
+ """
+ Intended to check GROBID extracted authors (right) against "known good"
+ (but maybe not perfect) Crossref metadata authors ("left").
+ """
if len(left) == 0:
return False
if len(left) > len(right):
@@ -59,8 +63,9 @@ def test_check_authors():
assert True == check_authors(['one two'], ['One Two'])
assert True == check_authors(['two'], ['One Two'])
assert True == check_authors(['two'], ['two, one'])
- assert True == check_authors(['Mr. Magoo'], ['mago'])
- assert True == check_authors(['one', 'two', 'three'], ['one', 'tw', 'thr'])
+ assert True == check_authors(['mago'], ['Mr. Magoo'])
+ assert True == check_authors(['Mr. Magoo'], ['Mr Magoo'])
+ assert True == check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three'])
# Rows are (score, grobid, crossref)
def process_group(rows):