diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2021-07-01 16:11:06 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2021-07-01 16:11:06 -0700 | 
| commit | b625155d565367141f7fbe0d5e507b9dc98ce4df (patch) | |
| tree | c1c7e95da0e48b24fb6715ab85b0bdcbd026fc7d /fuzzycat | |
| parent | 0d5535742786fe78f6509b6606ca381912ed8bc7 (diff) | |
| download | fuzzycat-b625155d565367141f7fbe0d5e507b9dc98ce4df.tar.gz fuzzycat-b625155d565367141f7fbe0d5e507b9dc98ce4df.zip | |
verify: page count parsing and comparison improvements
Diffstat (limited to 'fuzzycat')
| -rw-r--r-- | fuzzycat/utils.py | 18 | ||||
| -rw-r--r-- | fuzzycat/verify.py | 4 | 
2 files changed, 18 insertions, 4 deletions
```diff
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index d37ee32..bdca7b6 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -35,20 +35,32 @@ def es_compat_hits_total(resp):
 def parse_page_string(s):
     """
-    Parse typical page strings, e.g. 150-180.
+    Parse typical page strings, e.g. 150-180 or p123.
+
+    If only a single page number is found, returns that first page and None for
+    end page and count. If two are found, and they are consistent as a range,
+    returns the start, end, and count.
+
+    Does not handle lists of page numbers, roman numerals, and several other
+    patterns.
     """
     if not s:
         raise ValueError('page parsing: empty string')
+    if s[0].lower() in ('p', 'e'):
+        s = s[1:]
     if s.isnumeric():
-        return ParsedPages(start=int(s), end=int(s), count=1)
+        return ParsedPages(start=int(s), end=None, count=None)
     page_pattern = re.compile("([0-9]{1,})-([0-9]{1,})")
     match = page_pattern.match(s)
     if not match:
         raise ValueError('cannot parse page pattern from {}'.format(s))
     start, end = match.groups()
     if len(end) == 1 and start and start[-1] < end:
-        # 261-5, odd, but happens
+        # '261-5', odd, but happens
         end = start[:-1] + end
+    elif len(end) == 2 and start and start[-2:] < end:
+        # '577-89', also happens
+        end = start[:-2] + end
     a, b = int(start), int(end)
     if a > b:
         raise ValueError('invalid page range: {}'.format(s))
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 45a809e..f32121d 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -597,7 +597,9 @@ def verify(a: Dict, b: Dict, min_title_length=5) -> Tuple[str, str]:
     try:
         a_parsed_pages = parse_page_string(glom(a, "pages"))
         b_parsed_pages = parse_page_string(glom(b, "pages"))
-        if abs(a_parsed_pages.count - b_parsed_pages.count) > 5:
+        if (a_parsed_pages.count != None
+                and b_parsed_pages.count != None
+                and abs(a_parsed_pages.count - b_parsed_pages.count) > 5):
             return Verify(Status.DIFFERENT, Reason.PAGE_COUNT)
     except (ValueError, PathAccessError):
         pass
```
