aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2020-11-17 03:21:04 +0100
committer Martin Czygan <martin.czygan@gmail.com> 2020-11-17 03:21:04 +0100
commit 3c663e00810d064e88d3f9240c6b1cc1036dc014 (patch)
tree d430aee16a1a16659dcf01abfc0cfe1564c9a896
parent 70894830001d2698d341f085fc3f2c6409171342 (diff)
download fuzzycat-3c663e00810d064e88d3f9240c6b1cc1036dc014.tar.gz
fuzzycat-3c663e00810d064e88d3f9240c6b1cc1036dc014.zip
update stats
-rw-r--r-- fuzzycat/verify.py 12
1 files changed, 7 insertions, 5 deletions
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 180a6ac..b97f70a 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -26,17 +26,17 @@ Examples from clustering stage (from a sample of 100k records):
WIP:
{
- "miss.blacklisted": 926,
- "miss.contrib_intersection_empty": 3218,
+ "miss.blacklisted": 956,
+ "miss.contrib_intersection_empty": 3217,
"miss.dataset_doi": 8723,
"miss.num_diff": 14914,
"miss.release_type": 14305,
"miss.short_title": 3315,
"miss.subtitle": 102,
"miss.vhs": 45,
- "miss.year": 12332,
+ "miss.year": 12321,
"ok.arxiv_version": 13,
- "ok.dummy": 8640,
+ "ok.dummy": 8622,
"ok.preprint_published": 7,
"ok.slug_title_author_match": 498,
"ok.title_author_match": 6187,
@@ -47,7 +47,6 @@ WIP:
"total": 904844
}
-
"""
import collections
@@ -64,6 +63,9 @@ get_key_values = operator.itemgetter("k", "v")
# These titles appear too often, so ignore them for now.
TITLE_BLACKLIST = set([
+ "Medical Annotations",
+ "Medical Annotations.",
+ "Boundary Creek Times",
"",
":{unav)",
"[others]",