aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2020-11-25 17:40:21 +0100
committerMartin Czygan <martin.czygan@gmail.com>2020-11-25 17:40:21 +0100
commitf67e14fdb7ab6cad06b36a532e51eb309001a66f (patch)
treea5d2fa390f9a8b992adc185867d8e734ff2c9720
parent4a9633f0f989f4103a5c35721c5984e21a5d2192 (diff)
downloadfuzzycat-f67e14fdb7ab6cad06b36a532e51eb309001a66f.tar.gz
fuzzycat-f67e14fdb7ab6cad06b36a532e51eb309001a66f.zip
move helpers to utils
-rw-r--r--fuzzycat/utils.py23
-rw-r--r--fuzzycat/verify.py25
-rw-r--r--tests/test_utils.py4
-rw-r--r--tests/test_verify.py5
4 files changed, 30 insertions, 27 deletions
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index 7a8f067..4d1325d 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -1,9 +1,13 @@
import io
import itertools
+import re
import string
printable_no_punct = string.digits + string.ascii_letters + string.whitespace
+# Rough approximation of a chemical formula; a more correct element regex: https://www.johndcook.com/blog/2016/02/04/regular-expression-to-match-a-chemical-element/
+CHEM_FORMULA = re.compile(r"([A-Z]{1,2}[0-9]{1,2})+")
+
def slugify_string(s: str) -> str:
"""
@@ -67,3 +71,22 @@ def nwise(iterable, n=2):
while piece:
yield piece
piece = tuple(itertools.islice(i, n))
+
+
+def num_project(s):
+ """
+ Cf. https://fatcat.wiki/release/6b5yupd7bfcw7gp73hjoavbgfq,
+ https://fatcat.wiki/release/7hgzqz3hrngq7omtwdxz4qx34u
+
+ Unify every occurrence of a digit (or group of digits) by replacing it with the literal token "<NUM>"; returns the resulting string.
+ """
+ return re.sub(r'\d+', '<NUM>', s)
+
+
+def contains_chemical_formula(s):
+ """
+ Returns True if any whitespace-separated token of s (e.g. a title) matches CHEM_FORMULA (C3H8O or the like); otherwise falls through and returns None.
+ """
+ for token in s.split():
+ if CHEM_FORMULA.search(token):
+ return True
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 0fb9358..ab26603 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -66,15 +66,13 @@ import re
import sys
from fuzzycat.common import OK, Miss, Status
-from fuzzycat.utils import author_similarity_score, slugify_string
+from fuzzycat.utils import (author_similarity_score, contains_chemical_formula, num_project,
+ slugify_string)
# The result of clustering are documents that have a key k and a list of values
# (of the cluster) v.
get_key_values = operator.itemgetter("k", "v")
-# More correct: https://www.johndcook.com/blog/2016/02/04/regular-expression-to-match-a-chemical-element/
-CHEM_FORMULA = re.compile(r"([A-Z]{1,2}[0-9]{1,2})+")
-
class GroupVerifier:
"""
@@ -319,25 +317,6 @@ def compare(a, b):
return (Status.AMBIGUOUS, OK.DUMMY)
-def num_project(s):
- """
- Cf. https://fatcat.wiki/release/6b5yupd7bfcw7gp73hjoavbgfq,
- https://fatcat.wiki/release/7hgzqz3hrngq7omtwdxz4qx34u
-
- Unify every occurence of a digit (or group of digits).
- """
- return re.sub(r'\d+', '<NUM>', s)
-
-
-def contains_chemical_formula(s):
- """
- Returns true, if we find C3H8O or the like in title.
- """
- for token in s.split():
- if CHEM_FORMULA.search(token):
- return True
-
-
TITLE_FRAGMENT_BLACKLIST = set([
"air quality data from the life+respira project in pamplona",
"animaux vivants exclus ceux de la division",
diff --git a/tests/test_utils.py b/tests/test_utils.py
index bc0d918..9357fe8 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,6 +1,7 @@
import pytest
-from fuzzycat.utils import author_similarity_score, cut, slugify_string, jaccard, token_n_grams, tokenize_string, nwise
+from fuzzycat.utils import (author_similarity_score, cut, jaccard, nwise, slugify_string,
+ token_n_grams, tokenize_string)
def test_slugify_string():
@@ -62,4 +63,3 @@ def test_nwise():
assert list(nwise("1234")) == [("1", "2"), ("3", "4")]
assert list(nwise("1234", n=1)) == [("1", ), ("2", ), ("3", ), ("4", )]
assert list(nwise([1, 2, 3, 4, 5], n=3)) == [(1, 2, 3), (4, 5)]
-
diff --git a/tests/test_verify.py b/tests/test_verify.py
index 75e0277..90a22b5 100644
--- a/tests/test_verify.py
+++ b/tests/test_verify.py
@@ -1,8 +1,9 @@
-import pytest
import csv
import json
-import os
import logging
+import os
+
+import pytest
from fuzzycat.verify import Status, compare