diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2020-11-17 03:21:04 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2020-11-17 03:21:04 +0100 |
commit | 3c663e00810d064e88d3f9240c6b1cc1036dc014 (patch) | |
tree | d430aee16a1a16659dcf01abfc0cfe1564c9a896 | |
parent | 70894830001d2698d341f085fc3f2c6409171342 (diff) | |
download | fuzzycat-3c663e00810d064e88d3f9240c6b1cc1036dc014.tar.gz fuzzycat-3c663e00810d064e88d3f9240c6b1cc1036dc014.zip |
update stats
-rw-r--r-- | fuzzycat/verify.py | 12 |
1 files changed, 7 insertions, 5 deletions
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py index 180a6ac..b97f70a 100644 --- a/fuzzycat/verify.py +++ b/fuzzycat/verify.py @@ -26,17 +26,17 @@ Examples from clustering stage (from a sample of 100k records): WIP: { - "miss.blacklisted": 926, - "miss.contrib_intersection_empty": 3218, + "miss.blacklisted": 956, + "miss.contrib_intersection_empty": 3217, "miss.dataset_doi": 8723, "miss.num_diff": 14914, "miss.release_type": 14305, "miss.short_title": 3315, "miss.subtitle": 102, "miss.vhs": 45, - "miss.year": 12332, + "miss.year": 12321, "ok.arxiv_version": 13, - "ok.dummy": 8640, + "ok.dummy": 8622, "ok.preprint_published": 7, "ok.slug_title_author_match": 498, "ok.title_author_match": 6187, @@ -47,7 +47,6 @@ WIP: "total": 904844 } - """ import collections @@ -64,6 +63,9 @@ get_key_values = operator.itemgetter("k", "v") # There titles appear too often, so ignore them for now. TITLE_BLACKLIST = set([ + "Medical Annotations", + "Medical Annotations.", + "Boundary Creek Times", "", ":{unav)", "[others]", |