verify: page count parsing and comparison improvements

author: Bryan Newbold <bnewbold@archive.org> 2021-07-01 16:11:06 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-07-01 16:11:06 -0700
commit: b625155d565367141f7fbe0d5e507b9dc98ce4df (patch)
tree: c1c7e95da0e48b24fb6715ab85b0bdcbd026fc7d
parent: 0d5535742786fe78f6509b6606ca381912ed8bc7 (diff)
download: fuzzycat-b625155d565367141f7fbe0d5e507b9dc98ce4df.tar.gz fuzzycat-b625155d565367141f7fbe0d5e507b9dc98ce4df.zip
3 files changed, 25 insertions, 6 deletions
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index d37ee32..bdca7b6 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -35,20 +35,32 @@ def es_compat_hits_total(resp):
 
 def parse_page_string(s):
     """
-    Parse typical page strings, e.g. 150-180.
+    Parse typical page strings, e.g. 150-180 or p123.
+
+    If only a single page number is found, returns that first page and None for
+    end page and count. If two are found, and they are consistent as a range,
+    returns the start, end, and count.
+
+    Does not handle lists of page numbers, roman numerals, and several other
+    patterns.
     """
     if not s:
         raise ValueError('page parsing: empty string')
+    if s[0].lower() in ('p', 'e'):
+        s = s[1:]
     if s.isnumeric():
-        return ParsedPages(start=int(s), end=int(s), count=1)
+        return ParsedPages(start=int(s), end=None, count=None)
     page_pattern = re.compile("([0-9]{1,})-([0-9]{1,})")
     match = page_pattern.match(s)
     if not match:
         raise ValueError('cannot parse page pattern from {}'.format(s))
     start, end = match.groups()
     if len(end) == 1 and start and start[-1] < end:
-        # 261-5, odd, but happens
+        # '261-5', odd, but happens
         end = start[:-1] + end
+    elif len(end) == 2 and start and start[-2:] < end:
+        # '577-89', also happens
+        end = start[:-2] + end
     a, b = int(start), int(end)
     if a > b:
         raise ValueError('invalid page range: {}'.format(s))
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 45a809e..f32121d 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -597,7 +597,9 @@ def verify(a: Dict, b: Dict, min_title_length=5) -> Tuple[str, str]:
     try:
         a_parsed_pages = parse_page_string(glom(a, "pages"))
         b_parsed_pages = parse_page_string(glom(b, "pages"))
-        if abs(a_parsed_pages.count - b_parsed_pages.count) > 5:
+        if (a_parsed_pages.count != None
+                and b_parsed_pages.count != None
+                and abs(a_parsed_pages.count - b_parsed_pages.count) > 5):
             return Verify(Status.DIFFERENT, Reason.PAGE_COUNT)
     except (ValueError, PathAccessError):
         pass
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 381c44e..d0bcfc1 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -77,15 +77,20 @@ def test_dict_key_exists():
 
 
 def test_page_page_string():
-    reject = ("", "123-2", "123-120", "123a-124", "-2-1")
+    reject = ("", "123-2", "123-120", "123a-124", "-2-1", "I-II", "xv-xvi", "p")
     for s in reject:
         with pytest.raises(ValueError):
             assert parse_page_string(s)
-    assert parse_page_string("123") == (123, 123, 1)
+    assert parse_page_string("123") == (123, None, None)
+    assert parse_page_string("90-90") == (90, 90, 1)
     assert parse_page_string("123-5") == (123, 125, 3)
     assert parse_page_string("123-125") == (123, 125, 3)
     assert parse_page_string("123-124a") == (123, 124, 2)
     assert parse_page_string("1-1000") == (1, 1000, 1000)
+    assert parse_page_string("p55") == (55, None, None)
+    assert parse_page_string("p55-65") == (55, 65, 11)
+    assert parse_page_string("e1234") == (1234, None, None)
+    assert parse_page_string("577-89") == (577, 589, 13)
 
 
 def test_zstdlines():
author	Bryan Newbold <bnewbold@archive.org>	2021-07-01 16:11:06 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2021-07-01 16:11:06 -0700
commit	b625155d565367141f7fbe0d5e507b9dc98ce4df (patch)
tree	c1c7e95da0e48b24fb6715ab85b0bdcbd026fc7d
parent	0d5535742786fe78f6509b6606ca381912ed8bc7 (diff)
download	fuzzycat-b625155d565367141f7fbe0d5e507b9dc98ce4df.tar.gz fuzzycat-b625155d565367141f7fbe0d5e507b9dc98ce4df.zip