diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-26 12:54:37 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-26 12:54:37 -0700 |
commit | 05bd7cbcc62588e431c5efd533189e246b2a997e (patch) | |
tree | abcc707a451e77ea1e8c5ac9a5925b97a4bd139a /python/scripts/filter_scored_matches.py | |
parent | f3f424e42f2f4f383103cf80b30a00cfa6cfc179 (diff) | |
download | sandcrawler-05bd7cbcc62588e431c5efd533189e246b2a997e.tar.gz sandcrawler-05bd7cbcc62588e431c5efd533189e246b2a997e.zip |
make fmt
Diffstat (limited to 'python/scripts/filter_scored_matches.py')
-rwxr-xr-x | python/scripts/filter_scored_matches.py | 7 |
1 files changed, 6 insertions, 1 deletions
diff --git a/python/scripts/filter_scored_matches.py b/python/scripts/filter_scored_matches.py index abf81bd..3251852 100755 --- a/python/scripts/filter_scored_matches.py +++ b/python/scripts/filter_scored_matches.py @@ -33,6 +33,7 @@ def tokenize(s, remove_whitespace=False): # Encode as dumb ASCII (TODO: this is horrible) return s.encode('ascii', 'replace').replace(b'?', b'') + def check_authors(left, right): """ Intended to check GROBID extracted authors (right) against "known good" @@ -56,6 +57,7 @@ def check_authors(left, right): return False return True + def test_check_authors(): assert not check_authors([], []) assert not check_authors([], ['one']) @@ -67,6 +69,7 @@ def test_check_authors(): assert check_authors(['Mr. Magoo'], ['Mr Magoo']) assert check_authors(['one', 'tw', 'thr'], ['one', 'two', 'three']) + # Rows are (score, grobid, crossref) def process_group(rows): if len(rows) > max_slug_lines: @@ -92,6 +95,7 @@ def process_group(rows): for sha1, doi_list in keepers.items(): print("{}\t{}".format(sha1, json.dumps(doi_list))) + def run(): last_slug = None @@ -112,5 +116,6 @@ def run(): if lines: process_group(lines) -if __name__=='__main__': + +if __name__ == '__main__': run() |