diff options
| author | Martin Czygan <martin.czygan@gmail.com> | 2020-11-25 17:40:21 +0100 | 
|---|---|---|
| committer | Martin Czygan <martin.czygan@gmail.com> | 2020-11-25 17:40:21 +0100 | 
| commit | f67e14fdb7ab6cad06b36a532e51eb309001a66f (patch) | |
| tree | a5d2fa390f9a8b992adc185867d8e734ff2c9720 /fuzzycat | |
| parent | 4a9633f0f989f4103a5c35721c5984e21a5d2192 (diff) | |
| download | fuzzycat-f67e14fdb7ab6cad06b36a532e51eb309001a66f.tar.gz fuzzycat-f67e14fdb7ab6cad06b36a532e51eb309001a66f.zip  | |
move helpers to utils
Diffstat (limited to 'fuzzycat')
| -rw-r--r-- | fuzzycat/utils.py | 23 | ||||
| -rw-r--r-- | fuzzycat/verify.py | 25 | 
2 files changed, 25 insertions, 23 deletions
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py index 7a8f067..4d1325d 100644 --- a/fuzzycat/utils.py +++ b/fuzzycat/utils.py @@ -1,9 +1,13 @@  import io  import itertools +import re  import string  printable_no_punct = string.digits + string.ascii_letters + string.whitespace +# More correct: https://www.johndcook.com/blog/2016/02/04/regular-expression-to-match-a-chemical-element/ +CHEM_FORMULA = re.compile(r"([A-Z]{1,2}[0-9]{1,2})+") +  def slugify_string(s: str) -> str:      """ @@ -67,3 +71,22 @@ def nwise(iterable, n=2):      while piece:          yield piece          piece = tuple(itertools.islice(i, n)) + + +def num_project(s): +    """ +    Cf. https://fatcat.wiki/release/6b5yupd7bfcw7gp73hjoavbgfq, +    https://fatcat.wiki/release/7hgzqz3hrngq7omtwdxz4qx34u + +    Unify every occurence of a digit (or group of digits). +    """ +    return re.sub(r'\d+', '<NUM>', s) + + +def contains_chemical_formula(s): +    """ +    Returns true, if we find C3H8O or the like in title. +    """ +    for token in s.split(): +        if CHEM_FORMULA.search(token): +            return True diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py index 0fb9358..ab26603 100644 --- a/fuzzycat/verify.py +++ b/fuzzycat/verify.py @@ -66,15 +66,13 @@ import re  import sys  from fuzzycat.common import OK, Miss, Status -from fuzzycat.utils import author_similarity_score, slugify_string +from fuzzycat.utils import (author_similarity_score, contains_chemical_formula, num_project, +                            slugify_string)  # The result of clustering are documents that have a key k and a list of values  # (of the cluster) v.  get_key_values = operator.itemgetter("k", "v") -# More correct: https://www.johndcook.com/blog/2016/02/04/regular-expression-to-match-a-chemical-element/ -CHEM_FORMULA = re.compile(r"([A-Z]{1,2}[0-9]{1,2})+") -  class GroupVerifier:      """ @@ -319,25 +317,6 @@ def compare(a, b):      return (Status.AMBIGUOUS, OK.DUMMY) -def num_project(s): -    """ -    Cf. https://fatcat.wiki/release/6b5yupd7bfcw7gp73hjoavbgfq, -    https://fatcat.wiki/release/7hgzqz3hrngq7omtwdxz4qx34u - -    Unify every occurence of a digit (or group of digits). -    """ -    return re.sub(r'\d+', '<NUM>', s) - - -def contains_chemical_formula(s): -    """ -    Returns true, if we find C3H8O or the like in title. -    """ -    for token in s.split(): -        if CHEM_FORMULA.search(token): -            return True - -  TITLE_FRAGMENT_BLACKLIST = set([      "air quality data from the life+respira project in pamplona",      "animaux vivants exclus ceux de la division",  | 
