From 4e4422aef7806b311e10bd99589a347e2c4796ab Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Tue, 17 Nov 2020 03:08:52 +0100 Subject: update notes --- fuzzycat/verify.py | 66 ++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 52 insertions(+), 14 deletions(-) diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py index 241ce7f..704b034 100644 --- a/fuzzycat/verify.py +++ b/fuzzycat/verify.py @@ -26,18 +26,19 @@ Examples from clustering stage (from a sample of 100k records): WIP: { - "miss.blacklisted": 620, - "miss.contrib_intersection_empty": 3256, - "miss.dataset_doi": 8725, + "miss.blacklisted": 926, + "miss.contrib_intersection_empty": 3218, + "miss.dataset_doi": 8723, "miss.num_diff": 14914, - "miss.release_type": 14352, - "miss.short_title": 3321, + "miss.release_type": 14305, + "miss.short_title": 3315, + "miss.subtitle": 102, "miss.vhs": 45, - "miss.year": 12385, + "miss.year": 12332, "ok.arxiv_version": 13, - "ok.dummy": 8874, + "ok.dummy": 8640, "ok.preprint_published": 7, - "ok.slug_title_author_match": 526, + "ok.slug_title_author_match": 498, "ok.title_author_match": 6187, "skip.container_name_blacklist": 71, "skip.publisher_blacklist": 22, @@ -63,12 +64,18 @@ get_key_values = operator.itemgetter("k", "v") # There titles appear too often, so ignore them for now. 
TITLE_BLACKLIST = set([ - "annual meeting", - "an invitation to membership", - "appendix d.", - "appendix d", - "abstracts of papers from other journals", - "an epitome of current medical literature", + "actualités professionnelles", + "association notes", + "addenda", + "beyond the flyleaf", + "schlussbemerkung", + "editors/ editorial board", + "conference report", + "editorial board and publication information", + "front & back matter", + "abstract withdrawn", + "briefs", + "proceedings of societies", "", ":{unav)", "[others]", @@ -84,6 +91,7 @@ TITLE_BLACKLIST = set([ "about this journal", "about this title", "abréviations", + "abstracts of papers from other journals", "abstracts of papers to appear in future issues", "abstracts", "acknowledgement of reviewers", @@ -93,13 +101,20 @@ TITLE_BLACKLIST = set([ "acknowledgments", "actualités", "agradecimento", + "agradecimientos", "all pdfs of this category", + "an epitome of current medical literature", + "an invitation to membership", "announcement", "announcements", + "annual meeting", "annual report", + "appendix d", + "appendix d.", "around the world", "arthrobacter sp.", "aufgaben", + "ausgewählte literatur", "author index", "author response image 1. 
author response",
     "back matter",
@@ -195,6 +210,9 @@ PUBLISHER_BLACKLIST = set([
     "test accounts",
 ])
 
+# More correct: https://www.johndcook.com/blog/2016/02/04/regular-expression-to-match-a-chemical-element/
+CHEM_FORMULA = re.compile(r"([A-Z]{1,2}[0-9]{1,2})+")  # NOTE(review): uppercase-only pattern, but it is applied to slugified titles; confirm slugify_string keeps case
+
 
 class Status(str, Enum):
     """
@@ -231,6 +249,8 @@ class Miss(str, Enum):
     NUM_DIFF = 'miss.num_diff'
     DATASET_DOI = 'miss.dataset_doi'
     RELEASE_TYPE = 'miss.release_type'
+    CHEM_FORMULA = 'miss.chem_formula'
+    SUBTITLE = 'miss.subtitle'
 
 class GroupVerifier:
     """
@@ -325,6 +345,15 @@ def compare(a, b):
     a_slug_title = slugify_string(a.get("title", "")).replace("\n", " ")
     b_slug_title = slugify_string(b.get("title", "")).replace("\n", " ")
 
+    if a_slug_title == b_slug_title:  # same title: fall back to comparing subtitles
+        for a_sub in a.get("subtitle", []):
+            for b_sub in b.get("subtitle", []):  # pair each subtitle of a with each subtitle of b
+                if slugify_string(a_sub) != slugify_string(b_sub):
+                    return (Status.DIFFERENT, Miss.SUBTITLE)
+
+    if (contains_chemical_formula(a_slug_title) or contains_chemical_formula(b_slug_title)) and (a_slug_title != b_slug_title):
+        return (Status.DIFFERENT, Miss.CHEM_FORMULA)  # only when the titles actually differ
+
     if len(a_slug_title) < 10 and a_slug_title != b_slug_title:
         return (Status.AMBIGUOUS, Miss.SHORT_TITLE)
 
@@ -366,3 +395,12 @@ def num_project(s):
     """
     return re.sub('\d+', '', s)
 
+def contains_chemical_formula(s):
+    """
+    Return True if any whitespace-separated token of s matches CHEM_FORMULA (e.g. C3H8O), else False.
+    """
+    for token in s.split():
+        if CHEM_FORMULA.search(token):
+            return True
+    return False
+
-- 
cgit v1.2.3