From 3c663e00810d064e88d3f9240c6b1cc1036dc014 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Tue, 17 Nov 2020 03:21:04 +0100 Subject: update stats --- fuzzycat/verify.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py index 180a6ac..b97f70a 100644 --- a/fuzzycat/verify.py +++ b/fuzzycat/verify.py @@ -26,17 +26,17 @@ Examples from clustering stage (from a sample of 100k records): WIP: { - "miss.blacklisted": 926, - "miss.contrib_intersection_empty": 3218, + "miss.blacklisted": 956, + "miss.contrib_intersection_empty": 3217, "miss.dataset_doi": 8723, "miss.num_diff": 14914, "miss.release_type": 14305, "miss.short_title": 3315, "miss.subtitle": 102, "miss.vhs": 45, - "miss.year": 12332, + "miss.year": 12321, "ok.arxiv_version": 13, - "ok.dummy": 8640, + "ok.dummy": 8622, "ok.preprint_published": 7, "ok.slug_title_author_match": 498, "ok.title_author_match": 6187, @@ -47,7 +47,6 @@ WIP: "total": 904844 } - """ import collections @@ -64,6 +63,9 @@ get_key_values = operator.itemgetter("k", "v") # There titles appear too often, so ignore them for now. TITLE_BLACKLIST = set([ + "Medical Annotations", + "Medical Annotations.", + "Boundary Creek Times", "", ":{unav)", "[others]", -- cgit v1.2.3