aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--fuzzycat/data.py2
-rw-r--r--fuzzycat/utils.py24
-rw-r--r--fuzzycat/verify.py22
-rw-r--r--tests/test_utils.py21
4 files changed, 49 insertions, 20 deletions
diff --git a/fuzzycat/data.py b/fuzzycat/data.py
index 6a3cc27..9612065 100644
--- a/fuzzycat/data.py
+++ b/fuzzycat/data.py
@@ -1,5 +1,5 @@
"""
-Static assets, e.g. blacklists.
+Static assets, e.g. whitelists, blacklists, etc.
"""
__all__ = [
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index 7f08fa3..682f912 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -1,3 +1,4 @@
+import collections
import io
import itertools
import re
@@ -10,15 +11,34 @@ printable_no_punct = string.digits + string.ascii_letters + string.whitespace
# More correct: https://www.johndcook.com/blog/2016/02/04/regular-expression-to-match-a-chemical-element/
CHEM_FORMULA = re.compile(r"([A-Z]{1,2}[0-9]{1,2})+")
+ParsedPages = collections.namedtuple("ParsedPages", "start end count")
+
def parse_page_string(s):
"""
Parse typical page strings, e.g. 150-180.
"""
- raise NotImplementedError()
+ if not s:
+ raise ValueError('page parsing: empty string')
+ if s.isnumeric():
+ return ParsedPages(start=int(s), end=int(s), count=1)
+ page_pattern = re.compile("([0-9]{1,})-([0-9]{1,})")
+ match = page_pattern.match(s)
+ if not match:
+ raise ValueError('cannot parse page pattern from {}'.format(s))
+ start, end = match.groups()
+ if len(end) == 1 and start and start[-1] < end:
+ # Abbreviated end page, e.g. 261-5 meaning 261-265; odd, but happens
+ end = start[:-1] + end
+ a, b = int(start), int(end)
+ if a > b:
+ raise ValueError('invalid page range: {}'.format(s))
+ count = b - a + 1
+ return ParsedPages(start=a, end=b, count=count)
def dict_key_exists(doc, path):
"""
- Return true, if key at a given path exists. XXX: probably already in glom.
+ Return true if a key exists in a dictionary at the given path. XXX: probably
+ already in glom.
"""
try:
_ = glom(doc, path)
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index bd8e69b..9fcd62e 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -90,7 +90,7 @@ from fuzzycat.common import Reason, Status
from fuzzycat.data import (CONTAINER_NAME_BLACKLIST, PUBLISHER_BLACKLIST, TITLE_BLACKLIST,
TITLE_FRAGMENT_BLACKLIST)
from fuzzycat.utils import (author_similarity_score, contains_chemical_formula, dict_key_exists,
- has_doi_prefix, jaccard, num_project, slugify_string)
+ has_doi_prefix, jaccard, num_project, slugify_string, parse_page_string)
class GroupVerifier:
@@ -577,21 +577,11 @@ def verify(a: Dict, b: Dict, min_title_length=5) -> Tuple[str, str]:
# https://fatcat.wiki/release/tm3gaiumkvb3xc7t3i6suna6u4
# https://fatcat.wiki/release/r6dj63wh3zcrrolisn6xuacnve
try:
- a_pages = glom(a, "pages")
- b_pages = glom(b, "pages")
- # XXX: Pages might be of the form "261-5", meaning: 261-265.
- page_pattern = re.compile("([0-9]{1,})-([0-9]{1,})")
- a_match = page_pattern.match(a_pages)
- b_match = page_pattern.match(b_pages)
- if a_match and b_match:
- a_start, a_end = a_match.groups()
- b_start, b_end = b_match.groups()
- a_num_pages = int(a_end) - int(a_start)
- b_num_pages = int(b_end) - int(b_start)
- if a_num_pages >= 0 and b_num_pages >= 0:
- if abs(a_num_pages - b_num_pages) > 5:
- return (Status.DIFFERENT, Reason.PAGE_COUNT)
- except PathAccessError:
+ a_parsed_pages = parse_page_string(glom(a, "pages"))
+ b_parsed_pages = parse_page_string(glom(b, "pages"))
+ if abs(a_parsed_pages.count - b_parsed_pages.count) > 5:
+ return (Status.DIFFERENT, Reason.PAGE_COUNT)
+ except (ValueError, PathAccessError):
pass
return (Status.AMBIGUOUS, Reason.DUMMY)
diff --git a/tests/test_utils.py b/tests/test_utils.py
index a2033ac..38d50a7 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,7 +1,7 @@
import pytest
from fuzzycat.utils import (author_similarity_score, cut, jaccard, nwise, slugify_string,
- token_n_grams, tokenize_string)
+ token_n_grams, tokenize_string, parse_page_string, dict_key_exists)
def test_slugify_string():
@@ -63,3 +63,22 @@ def test_nwise():
assert list(nwise("1234")) == [("1", "2"), ("3", "4")]
assert list(nwise("1234", n=1)) == [("1", ), ("2", ), ("3", ), ("4", )]
assert list(nwise([1, 2, 3, 4, 5], n=3)) == [(1, 2, 3), (4, 5)]
+
+def test_dict_key_exists():
+ assert dict_key_exists({}, "") is False
+ assert dict_key_exists({"a": "a"}, "a") == True
+ assert dict_key_exists({"a": "a"}, "b") == False
+ assert dict_key_exists({"a": {"b": "c"}}, "a.b") == True
+ assert dict_key_exists({"a": {"b": None}}, "a.b") == True
+ assert dict_key_exists({"a": {"b": "c"}}, "a.b.c") == False
+
+def test_page_page_string():
+ reject = ("", "123-2", "123-120", "123a-124", "-2-1")
+ for s in reject:
+ with pytest.raises(ValueError):
+ assert parse_page_string(s)
+ assert parse_page_string("123") == (123, 123, 1)
+ assert parse_page_string("123-5") == (123, 125, 3)
+ assert parse_page_string("123-125") == (123, 125, 3)
+ assert parse_page_string("123-124a") == (123, 124, 2)
+ assert parse_page_string("1-1000") == (1, 1000, 1000)