diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-07-01 16:11:06 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-07-01 16:11:06 -0700 |
commit | b625155d565367141f7fbe0d5e507b9dc98ce4df (patch) | |
tree | c1c7e95da0e48b24fb6715ab85b0bdcbd026fc7d /fuzzycat | |
parent | 0d5535742786fe78f6509b6606ca381912ed8bc7 (diff) | |
download | fuzzycat-b625155d565367141f7fbe0d5e507b9dc98ce4df.tar.gz fuzzycat-b625155d565367141f7fbe0d5e507b9dc98ce4df.zip |
verify: page count parsing and comparison improvements
Diffstat (limited to 'fuzzycat')
-rw-r--r-- | fuzzycat/utils.py | 18 | ||||
-rw-r--r-- | fuzzycat/verify.py | 4 |
2 files changed, 18 insertions, 4 deletions
```diff
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index d37ee32..bdca7b6 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -35,20 +35,32 @@ def es_compat_hits_total(resp):
 
 def parse_page_string(s):
     """
-    Parse typical page strings, e.g. 150-180.
+    Parse typical page strings, e.g. 150-180 or p123.
+
+    If only a single page number is found, returns that first page and None for
+    end page and count. If two are found, and they are consistent as a range,
+    returns the start, end, and count.
+
+    Does not handle lists of page numbers, roman numerals, and several other
+    patterns.
     """
     if not s:
         raise ValueError('page parsing: empty string')
+    if s[0].lower() in ('p', 'e'):
+        s = s[1:]
     if s.isnumeric():
-        return ParsedPages(start=int(s), end=int(s), count=1)
+        return ParsedPages(start=int(s), end=None, count=None)
     page_pattern = re.compile("([0-9]{1,})-([0-9]{1,})")
     match = page_pattern.match(s)
     if not match:
         raise ValueError('cannot parse page pattern from {}'.format(s))
     start, end = match.groups()
     if len(end) == 1 and start and start[-1] < end:
-        # 261-5, odd, but happens
+        # '261-5', odd, but happens
         end = start[:-1] + end
+    elif len(end) == 2 and start and start[-2:] < end:
+        # '577-89', also happens
+        end = start[:-2] + end
     a, b = int(start), int(end)
     if a > b:
         raise ValueError('invalid page range: {}'.format(s))
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 45a809e..f32121d 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -597,7 +597,9 @@ def verify(a: Dict, b: Dict, min_title_length=5) -> Tuple[str, str]:
     try:
         a_parsed_pages = parse_page_string(glom(a, "pages"))
         b_parsed_pages = parse_page_string(glom(b, "pages"))
-        if abs(a_parsed_pages.count - b_parsed_pages.count) > 5:
+        if (a_parsed_pages.count != None
+                and b_parsed_pages.count != None
+                and abs(a_parsed_pages.count - b_parsed_pages.count) > 5):
             return Verify(Status.DIFFERENT, Reason.PAGE_COUNT)
     except (ValueError, PathAccessError):
         pass
```