aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2020-11-17 02:36:40 +0100
committer Martin Czygan <martin.czygan@gmail.com> 2020-11-17 02:36:40 +0100
commit e503c86cddd1591cccf1354b7c8ecd1a00d600e3 (patch)
tree 1ba3bfcd689a581a337913b406d67a9e7d1ba0ba
parent 8d708f337c8abbec61229f023637ddebcee827b5 (diff)
download fuzzycat-e503c86cddd1591cccf1354b7c8ecd1a00d600e3.tar.gz
fuzzycat-e503c86cddd1591cccf1354b7c8ecd1a00d600e3.zip
update docs and blacklist
-rw-r--r-- fuzzycat/verify.py 28
1 files changed, 28 insertions, 0 deletions
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 9af70e4..241ce7f 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -23,6 +23,30 @@ Examples from clustering stage (from a sample of 100k records):
["Editorial Board & Publication Information",2]
...
+WIP:
+
+ {
+ "miss.blacklisted": 620,
+ "miss.contrib_intersection_empty": 3256,
+ "miss.dataset_doi": 8725,
+ "miss.num_diff": 14914,
+ "miss.release_type": 14352,
+ "miss.short_title": 3321,
+ "miss.vhs": 45,
+ "miss.year": 12385,
+ "ok.arxiv_version": 13,
+ "ok.dummy": 8874,
+ "ok.preprint_published": 7,
+ "ok.slug_title_author_match": 526,
+ "ok.title_author_match": 6187,
+ "skip.container_name_blacklist": 71,
+ "skip.publisher_blacklist": 22,
+ "skip.too_large": 1077,
+ "skip.unique": 830449,
+ "total": 904844
+ }
+
+
"""
import collections
@@ -41,6 +65,10 @@ get_key_values = operator.itemgetter("k", "v")
TITLE_BLACKLIST = set([
"annual meeting",
"an invitation to membership",
+ "appendix d.",
+ "appendix d",
+ "abstracts of papers from other journals",
+ "an epitome of current medical literature",
"",
":{unav)",
"[others]",