verify: move out some code to utils

author: Martin Czygan <martin.czygan@gmail.com> 2020-12-14 20:31:54 +0100
committer: Martin Czygan <martin.czygan@gmail.com> 2020-12-14 20:31:54 +0100
commit: 54a5fc7f3fa6893a83143a1755aa6b4497efa33c (patch)
tree: cdce9081d3e4410e757c7a95d57bd8509e7cf57c
parent: d3891ff1242627464e7e0eee68ab07a61c0678d4 (diff)
download: fuzzycat-54a5fc7f3fa6893a83143a1755aa6b4497efa33c.tar.gz
fuzzycat-54a5fc7f3fa6893a83143a1755aa6b4497efa33c.zip
4 files changed, 49 insertions, 20 deletions
diff --git a/fuzzycat/data.py b/fuzzycat/data.py
index 6a3cc27..9612065 100644
--- a/fuzzycat/data.py
+++ b/fuzzycat/data.py
@@ -1,5 +1,5 @@
 """
-Static assets, e.g. blacklists.
+Static assets, e.g. whitelists, blacklists, etc.
 """
 
 __all__ = [
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index 7f08fa3..682f912 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -1,3 +1,4 @@
+import collections
 import io
 import itertools
 import re
@@ -10,15 +11,34 @@ printable_no_punct = string.digits + string.ascii_letters + string.whitespace
 # More correct: https://www.johndcook.com/blog/2016/02/04/regular-expression-to-match-a-chemical-element/
 CHEM_FORMULA = re.compile(r"([A-Z]{1,2}[0-9]{1,2})+")
 
+ParsedPages = collections.namedtuple("ParsedPages", "start end count")
+
 def parse_page_string(s):
     """
     Parse typical page strings, e.g. 150-180.
     """
-    raise NotImplementedError()
+    if not s:
+        raise ValueError('page parsing: empty string')
+    if s.isnumeric():
+        return ParsedPages(start=int(s), end=int(s), count=1)
+    page_pattern = re.compile("([0-9]{1,})-([0-9]{1,})")
+    match = page_pattern.match(s)
+    if not match:
+        raise ValueError('cannot parse page pattern from {}'.format(s))
+    start, end = match.groups()
+    if len(end) == 1 and start and start[-1] < end:
+        # 261-5, odd, but happens
+        end = start[:-1] + end
+    a, b = int(start), int(end)
+    if a > b:
+        raise ValueError('invalid page range: {}'.format(s))
+    count = b - a + 1
+    return ParsedPages(start=a, end=b, count=count)
 
 def dict_key_exists(doc, path):
     """
-    Return true, if key at a given path exists. XXX: probably already in glom.
+    Return true, if key in a dictionary at a given path exists. XXX: probably
+    already in glom.
     """
     try:
         _ = glom(doc, path)
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index bd8e69b..9fcd62e 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -90,7 +90,7 @@ from fuzzycat.common import Reason, Status
 from fuzzycat.data import (CONTAINER_NAME_BLACKLIST, PUBLISHER_BLACKLIST, TITLE_BLACKLIST,
                            TITLE_FRAGMENT_BLACKLIST)
 from fuzzycat.utils import (author_similarity_score, contains_chemical_formula, dict_key_exists,
-                            has_doi_prefix, jaccard, num_project, slugify_string)
+                            has_doi_prefix, jaccard, num_project, slugify_string, parse_page_string)
 
 
 class GroupVerifier:
@@ -577,21 +577,11 @@ def verify(a: Dict, b: Dict, min_title_length=5) -> Tuple[str, str]:
     # https://fatcat.wiki/release/tm3gaiumkvb3xc7t3i6suna6u4
     # https://fatcat.wiki/release/r6dj63wh3zcrrolisn6xuacnve
     try:
-        a_pages = glom(a, "pages")
-        b_pages = glom(b, "pages")
-        # XXX: Pages might be of the form "261-5", meaning: 261-265.
-        page_pattern = re.compile("([0-9]{1,})-([0-9]{1,})")
-        a_match = page_pattern.match(a_pages)
-        b_match = page_pattern.match(b_pages)
-        if a_match and b_match:
-            a_start, a_end = a_match.groups()
-            b_start, b_end = b_match.groups()
-            a_num_pages = int(a_end) - int(a_start)
-            b_num_pages = int(b_end) - int(b_start)
-            if a_num_pages >= 0 and b_num_pages >= 0:
-                if abs(a_num_pages - b_num_pages) > 5:
-                    return (Status.DIFFERENT, Reason.PAGE_COUNT)
-    except PathAccessError:
+        a_parsed_pages = parse_page_string(glom(a, "pages"))
+        b_parsed_pages = parse_page_string(glom(b, "pages"))
+        if abs(a_parsed_pages.count - b_parsed_pages.count) > 5:
+            return (Status.DIFFERENT, Reason.PAGE_COUNT)
+    except (ValueError, PathAccessError):
         pass
 
     return (Status.AMBIGUOUS, Reason.DUMMY)
diff --git a/tests/test_utils.py b/tests/test_utils.py
index a2033ac..38d50a7 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,7 +1,7 @@
 import pytest
 
 from fuzzycat.utils import (author_similarity_score, cut, jaccard, nwise, slugify_string,
-                            token_n_grams, tokenize_string)
+                            token_n_grams, tokenize_string, parse_page_string, dict_key_exists)
 
 
 def test_slugify_string():
@@ -63,3 +63,22 @@ def test_nwise():
     assert list(nwise("1234")) == [("1", "2"), ("3", "4")]
     assert list(nwise("1234", n=1)) == [("1", ), ("2", ), ("3", ), ("4", )]
     assert list(nwise([1, 2, 3, 4, 5], n=3)) == [(1, 2, 3), (4, 5)]
+
+def test_dict_key_exists():
+    assert dict_key_exists({}, "") is False
+    assert dict_key_exists({"a": "a"}, "a") == True
+    assert dict_key_exists({"a": "a"}, "b") == False
+    assert dict_key_exists({"a": {"b": "c"}}, "a.b") == True
+    assert dict_key_exists({"a": {"b": None}}, "a.b") == True
+    assert dict_key_exists({"a": {"b": "c"}}, "a.b.c") == False
+
+def test_page_page_string():
+    reject = ("", "123-2", "123-120", "123a-124", "-2-1")
+    for s in reject:
+        with pytest.raises(ValueError):
+            assert parse_page_string(s)
+    assert parse_page_string("123") == (123, 123, 1)
+    assert parse_page_string("123-5") == (123, 125, 3)
+    assert parse_page_string("123-125") == (123, 125, 3)
+    assert parse_page_string("123-124a") == (123, 124, 2)
+    assert parse_page_string("1-1000") == (1, 1000, 1000)
author	Martin Czygan <martin.czygan@gmail.com>	2020-12-14 20:31:54 +0100
committer	Martin Czygan <martin.czygan@gmail.com>	2020-12-14 20:31:54 +0100
commit	54a5fc7f3fa6893a83143a1755aa6b4497efa33c (patch)
tree	cdce9081d3e4410e757c7a95d57bd8509e7cf57c
parent	d3891ff1242627464e7e0eee68ab07a61c0678d4 (diff)
download	fuzzycat-54a5fc7f3fa6893a83143a1755aa6b4497efa33c.tar.gz fuzzycat-54a5fc7f3fa6893a83143a1755aa6b4497efa33c.zip