diff options
Diffstat (limited to 'python/scripts/filter_groupworks.py')
-rwxr-xr-x | python/scripts/filter_groupworks.py | 8 |
1 files changed, 7 insertions, 1 deletions
diff --git a/python/scripts/filter_groupworks.py b/python/scripts/filter_groupworks.py index 494da71..fda9098 100755 --- a/python/scripts/filter_groupworks.py +++ b/python/scripts/filter_groupworks.py @@ -28,6 +28,7 @@ MAX_SLUG_LINES = 50 REQUIRE_AUTHORS = False + def tokenize(s, remove_whitespace=False): s.replace('&apos;', "'") @@ -40,6 +41,7 @@ def tokenize(s, remove_whitespace=False): # Encode as dumb ASCII (TODO: this is horrible) return s.encode('ascii', 'replace').replace(b'?', b'') + def check_authors(left, right): """ Intended to check GROBID extracted authors (right) against "known good" @@ -63,6 +65,7 @@ def check_authors(left, right): return False return True + def test_check_authors(): assert check_authors([], []) == bool(not REQUIRE_AUTHORS) assert not check_authors([], ['one']) @@ -74,6 +77,7 @@ def test_check_authors(): assert check_authors(['Mr. Magoo'], ['Mr Magoo']) assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three']) + # Rows are (score, left, right) def process_group(rows): @@ -119,6 +123,7 @@ def process_group(rows): print(json.dumps([releases[ident] for ident in group_ids])) + def run(): last_slug = None @@ -140,5 +145,6 @@ def run(): if lines: process_group(lines) -if __name__=='__main__': + +if __name__ == '__main__': run() |