about | summary | refs | log | tree | commit | diff | stats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2018-09-17 14:28:30 -0700
committer Bryan Newbold <bnewbold@archive.org> 2018-09-17 14:28:30 -0700
commit 33a8f5b630ff52fcce10abfc272e2d8607ff591b (patch)
tree c0973d2c230f95a11056829a6e8f21cd3976d2be /python
parent 710a0feab36f83eef21885ee7c23e5841cae1e87 (diff)
download sandcrawler-33a8f5b630ff52fcce10abfc272e2d8607ff591b.tar.gz
sandcrawler-33a8f5b630ff52fcce10abfc272e2d8607ff591b.zip
filter_scored_matches: fix tests
Diffstat (limited to 'python')
-rwxr-xr-x python/filter_scored_matches.py 9
1 files changed, 7 insertions, 2 deletions
diff --git a/python/filter_scored_matches.py b/python/filter_scored_matches.py
index 819e13c..a656705 100755
--- a/python/filter_scored_matches.py
+++ b/python/filter_scored_matches.py
@@ -34,6 +34,10 @@ def tokenize(s, remove_whitespace=False):
return s.encode('ascii', 'replace').replace(b'?', b'')
def check_authors(left, right):
+ """
+ Intended to check GROBID extracted authors (right) against "known good"
+ (but maybe not perfect) Crossref metadata authors ("left").
+ """
if len(left) == 0:
return False
if len(left) > len(right):
@@ -59,8 +63,9 @@ def test_check_authors():
assert True == check_authors(['one two'], ['One Two'])
assert True == check_authors(['two'], ['One Two'])
assert True == check_authors(['two'], ['two, one'])
- assert True == check_authors(['Mr. Magoo'], ['mago'])
- assert True == check_authors(['one', 'two', 'three'], ['one', 'tw', 'thr'])
+ assert True == check_authors(['mago'], ['Mr. Magoo'])
+ assert True == check_authors(['Mr. Magoo'], ['Mr Magoo'])
+ assert True == check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three'])
# Rows are (score, grobid, crossref)
def process_group(rows):