From f67e14fdb7ab6cad06b36a532e51eb309001a66f Mon Sep 17 00:00:00 2001
From: Martin Czygan
Date: Wed, 25 Nov 2020 17:40:21 +0100
Subject: move helpers to utils

---
 fuzzycat/utils.py    | 23 +++++++++++++++++++++++
 fuzzycat/verify.py   | 25 ++-----------------------
 tests/test_utils.py  |  4 ++--
 tests/test_verify.py |  5 +++--
 4 files changed, 30 insertions(+), 27 deletions(-)

diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index 7a8f067..4d1325d 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -1,9 +1,13 @@
 import io
 import itertools
+import re
 import string
 
 printable_no_punct = string.digits + string.ascii_letters + string.whitespace
 
+# More correct: https://www.johndcook.com/blog/2016/02/04/regular-expression-to-match-a-chemical-element/
+CHEM_FORMULA = re.compile(r"([A-Z]{1,2}[0-9]{1,2})+")
+
 
 def slugify_string(s: str) -> str:
     """
@@ -67,3 +71,22 @@ def nwise(iterable, n=2):
     while piece:
         yield piece
         piece = tuple(itertools.islice(i, n))
+
+
+def num_project(s):
+    """
+    Cf. https://fatcat.wiki/release/6b5yupd7bfcw7gp73hjoavbgfq,
+    https://fatcat.wiki/release/7hgzqz3hrngq7omtwdxz4qx34u
+
+    Unify every occurence of a digit (or group of digits).
+    """
+    return re.sub(r'\d+', '', s)
+
+
+def contains_chemical_formula(s):
+    """
+    Returns true, if we find C3H8O or the like in title.
+    """
+    for token in s.split():
+        if CHEM_FORMULA.search(token):
+            return True
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 0fb9358..ab26603 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -66,15 +66,13 @@ import re
 import sys
 
 from fuzzycat.common import OK, Miss, Status
-from fuzzycat.utils import author_similarity_score, slugify_string
+from fuzzycat.utils import (author_similarity_score, contains_chemical_formula, num_project,
+                            slugify_string)
 
 # The result of clustering are documents that have a key k and a list of values
 # (of the cluster) v.
 get_key_values = operator.itemgetter("k", "v")
 
-# More correct: https://www.johndcook.com/blog/2016/02/04/regular-expression-to-match-a-chemical-element/
-CHEM_FORMULA = re.compile(r"([A-Z]{1,2}[0-9]{1,2})+")
-
 
 class GroupVerifier:
     """
@@ -319,25 +317,6 @@ def compare(a, b):
     return (Status.AMBIGUOUS, OK.DUMMY)
 
 
-def num_project(s):
-    """
-    Cf. https://fatcat.wiki/release/6b5yupd7bfcw7gp73hjoavbgfq,
-    https://fatcat.wiki/release/7hgzqz3hrngq7omtwdxz4qx34u
-
-    Unify every occurence of a digit (or group of digits).
-    """
-    return re.sub(r'\d+', '', s)
-
-
-def contains_chemical_formula(s):
-    """
-    Returns true, if we find C3H8O or the like in title.
-    """
-    for token in s.split():
-        if CHEM_FORMULA.search(token):
-            return True
-
-
 TITLE_FRAGMENT_BLACKLIST = set([
     "air quality data from the life+respira project in pamplona",
     "animaux vivants exclus ceux de la division",
diff --git a/tests/test_utils.py b/tests/test_utils.py
index bc0d918..9357fe8 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,6 +1,7 @@
 import pytest
 
-from fuzzycat.utils import author_similarity_score, cut, slugify_string, jaccard, token_n_grams, tokenize_string, nwise
+from fuzzycat.utils import (author_similarity_score, cut, jaccard, nwise, slugify_string,
+                            token_n_grams, tokenize_string)
 
 
 def test_slugify_string():
@@ -62,4 +63,3 @@ def test_nwise():
     assert list(nwise("1234")) == [("1", "2"), ("3", "4")]
     assert list(nwise("1234", n=1)) == [("1", ), ("2", ), ("3", ), ("4", )]
     assert list(nwise([1, 2, 3, 4, 5], n=3)) == [(1, 2, 3), (4, 5)]
-
diff --git a/tests/test_verify.py b/tests/test_verify.py
index 75e0277..90a22b5 100644
--- a/tests/test_verify.py
+++ b/tests/test_verify.py
@@ -1,8 +1,9 @@
-import pytest
 import csv
 import json
-import os
 import logging
+import os
+
+import pytest
 
 from fuzzycat.verify import Status, compare
 
-- 
cgit v1.2.3