From 39b35a1274ac7ebdea88d2913bef8d0357063ff3 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Thu, 17 Dec 2020 16:27:33 +0100 Subject: update stats --- fuzzycat/verify.py | 81 +++++++++++++++++++++++++++--------------------------- 1 file changed, 40 insertions(+), 41 deletions(-) diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py index 6b0a448..1cf8f3c 100644 --- a/fuzzycat/verify.py +++ b/fuzzycat/verify.py @@ -33,47 +33,46 @@ Stats from running over a full database dump. We need to run verification over 25586837 entity pairs, of which 1346217/25586837 (or about 5%) are too ambiguous at this time. -TODO: rerun to adjust formatting. - - 3450874 OK.TITLE_AUTHOR_MATCH - 2619990 OK.SLUG_TITLE_AUTHOR_MATCH - 2487633 Miss.YEAR - 2434532 OK.WORK_ID - 2085006 Miss.CONTRIB_INTERSECTION_EMPTY - 1397420 Miss.SHARED_DOI_PREFIX - 1355852 Miss.RELEASE_TYPE - 1346217 OK.DUMMY - 1145511 Miss.BOOK_CHAPTER - 1009657 Miss.DATASET_DOI - 996503 OK.PMID_DOI_PAIR - 868951 OK.DATACITE_VERSION - 796216 OK.DATACITE_RELATED_ID - 704154 OK.FIGSHARE_VERSION - 534963 OK.VERSIONED_DOI - 343310 OK.TOKENIZED_AUTHORS - 334974 OK.JACCARD_AUTHORS - 293835 OK.PREPRINT_PUBLISHED - 269366 Miss.COMPONENT - 263626 Miss.SUBTITLE - 224021 Miss.SHORT_TITLE - 133811 Miss.CUSTOM_PREFIX_10_5860_CHOICE_REVIEW - 122600 Miss.CUSTOM_PREFIX_10_7916 - 96935 Miss.PAGE_COUNT - 79664 OK.CUSTOM_IEEE_ARXIV - 46649 Miss.CUSTOM_PREFIX_10_14288 - 39797 Miss.JSTOR_ID - 38598 OK.CUSTOM_BSI_UNDATED - 18907 OK.CUSTOM_BSI_SUBDOC - 15465 OK.DOI - 13393 Miss.CUSTOM_IOP_MA_PATTERN - 10378 Miss.CONTAINER - 3081 Miss.BLACKLISTED - 2504 Miss.BLACKLISTED_FRAGMENT - 1273 Miss.APPENDIX - 1063 Miss.TITLE_FILENAME - 104 Miss.NUM_DIFF - 4 OK.ARXIV_VERSION - +Found Status Reason +-------------------------------------------------------------------------- +3450874 Status.EXACT Reason.TITLE_AUTHOR_MATCH +2619990 Status.STRONG Reason.SLUG_TITLE_AUTHOR_MATCH +2487633 Status.DIFFERENT Reason.YEAR +2434532 Status.EXACT Reason.WORK_ID 
+2085006 Status.DIFFERENT Reason.CONTRIB_INTERSECTION_EMPTY +1397420 Status.DIFFERENT Reason.SHARED_DOI_PREFIX +1355852 Status.DIFFERENT Reason.RELEASE_TYPE +1290162 Status.AMBIGUOUS Reason.DUMMY +1145511 Status.DIFFERENT Reason.BOOK_CHAPTER +1009657 Status.DIFFERENT Reason.DATASET_DOI + 996503 Status.STRONG Reason.PMID_DOI_PAIR + 868951 Status.EXACT Reason.DATACITE_VERSION + 796216 Status.STRONG Reason.DATACITE_RELATED_ID + 704154 Status.STRONG Reason.FIGSHARE_VERSION + 534963 Status.STRONG Reason.VERSIONED_DOI + 343310 Status.STRONG Reason.TOKENIZED_AUTHORS + 334974 Status.STRONG Reason.JACCARD_AUTHORS + 293835 Status.STRONG Reason.PREPRINT_PUBLISHED + 269366 Status.DIFFERENT Reason.COMPONENT + 263626 Status.DIFFERENT Reason.SUBTITLE + 224021 Status.AMBIGUOUS Reason.SHORT_TITLE + 152990 Status.DIFFERENT Reason.PAGE_COUNT + 133811 Status.AMBIGUOUS Reason.CUSTOM_PREFIX_10_5860_CHOICE_REVIEW + 122600 Status.AMBIGUOUS Reason.CUSTOM_PREFIX_10_7916 + 79664 Status.STRONG Reason.CUSTOM_IEEE_ARXIV + 46649 Status.DIFFERENT Reason.CUSTOM_PREFIX_10_14288 + 39797 Status.DIFFERENT Reason.JSTOR_ID + 38598 Status.STRONG Reason.CUSTOM_BSI_UNDATED + 18907 Status.STRONG Reason.CUSTOM_BSI_SUBDOC + 15465 Status.EXACT Reason.DOI + 13393 Status.DIFFERENT Reason.CUSTOM_IOP_MA_PATTERN + 10378 Status.DIFFERENT Reason.CONTAINER + 3081 Status.AMBIGUOUS Reason.BLACKLISTED + 2504 Status.AMBIGUOUS Reason.BLACKLISTED_FRAGMENT + 1273 Status.AMBIGUOUS Reason.APPENDIX + 1063 Status.DIFFERENT Reason.TITLE_FILENAME + 104 Status.DIFFERENT Reason.NUM_DIFF + 4 Status.STRONG Reason.ARXIV_VERSION """ import collections -- cgit v1.2.3