about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-07-01 16:11:06 -0700
committer Bryan Newbold <bnewbold@archive.org> 2021-07-01 16:11:06 -0700
commit b625155d565367141f7fbe0d5e507b9dc98ce4df (patch)
tree c1c7e95da0e48b24fb6715ab85b0bdcbd026fc7d
parent 0d5535742786fe78f6509b6606ca381912ed8bc7 (diff)
download fuzzycat-b625155d565367141f7fbe0d5e507b9dc98ce4df.tar.gz
fuzzycat-b625155d565367141f7fbe0d5e507b9dc98ce4df.zip
verify: page count parsing and comparison improvements
-rw-r--r-- fuzzycat/utils.py 18
-rw-r--r-- fuzzycat/verify.py 4
-rw-r--r-- tests/test_utils.py 9
3 files changed, 25 insertions, 6 deletions
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index d37ee32..bdca7b6 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -35,20 +35,32 @@ def es_compat_hits_total(resp):
def parse_page_string(s):
"""
- Parse typical page strings, e.g. 150-180.
+ Parse typical page strings, e.g. 150-180 or p123.
+
+ If only a single page number is found, returns that first page and None for
+ end page and count. If two are found, and they are consistent as a range,
+ returns the start, end, and count.
+
+ Does not handle lists of page numbers, roman numerals, and several other
+ patterns.
"""
if not s:
raise ValueError('page parsing: empty string')
+ if s[0].lower() in ('p', 'e'):
+ s = s[1:]
if s.isnumeric():
- return ParsedPages(start=int(s), end=int(s), count=1)
+ return ParsedPages(start=int(s), end=None, count=None)
page_pattern = re.compile("([0-9]{1,})-([0-9]{1,})")
match = page_pattern.match(s)
if not match:
raise ValueError('cannot parse page pattern from {}'.format(s))
start, end = match.groups()
if len(end) == 1 and start and start[-1] < end:
- # 261-5, odd, but happens
+ # '261-5', odd, but happens
end = start[:-1] + end
+ elif len(end) == 2 and start and start[-2:] < end:
+ # '577-89', also happens
+ end = start[:-2] + end
a, b = int(start), int(end)
if a > b:
raise ValueError('invalid page range: {}'.format(s))
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 45a809e..f32121d 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -597,7 +597,9 @@ def verify(a: Dict, b: Dict, min_title_length=5) -> Tuple[str, str]:
try:
a_parsed_pages = parse_page_string(glom(a, "pages"))
b_parsed_pages = parse_page_string(glom(b, "pages"))
- if abs(a_parsed_pages.count - b_parsed_pages.count) > 5:
+ if (a_parsed_pages.count != None
+ and b_parsed_pages.count != None
+ and abs(a_parsed_pages.count - b_parsed_pages.count) > 5):
return Verify(Status.DIFFERENT, Reason.PAGE_COUNT)
except (ValueError, PathAccessError):
pass
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 381c44e..d0bcfc1 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -77,15 +77,20 @@ def test_dict_key_exists():
def test_page_page_string():
- reject = ("", "123-2", "123-120", "123a-124", "-2-1")
+ reject = ("", "123-2", "123-120", "123a-124", "-2-1", "I-II", "xv-xvi", "p")
for s in reject:
with pytest.raises(ValueError):
assert parse_page_string(s)
- assert parse_page_string("123") == (123, 123, 1)
+ assert parse_page_string("123") == (123, None, None)
+ assert parse_page_string("90-90") == (90, 90, 1)
assert parse_page_string("123-5") == (123, 125, 3)
assert parse_page_string("123-125") == (123, 125, 3)
assert parse_page_string("123-124a") == (123, 124, 2)
assert parse_page_string("1-1000") == (1, 1000, 1000)
+ assert parse_page_string("p55") == (55, None, None)
+ assert parse_page_string("p55-65") == (55, 65, 11)
+ assert parse_page_string("e1234") == (1234, None, None)
+ assert parse_page_string("577-89") == (577, 589, 13)
def test_zstdlines():