diff options
| author | Martin Czygan <martin.czygan@gmail.com> | 2020-12-14 20:31:54 +0100 | 
|---|---|---|
| committer | Martin Czygan <martin.czygan@gmail.com> | 2020-12-14 20:31:54 +0100 | 
| commit | 54a5fc7f3fa6893a83143a1755aa6b4497efa33c (patch) | |
| tree | cdce9081d3e4410e757c7a95d57bd8509e7cf57c /fuzzycat | |
| parent | d3891ff1242627464e7e0eee68ab07a61c0678d4 (diff) | |
| download | fuzzycat-54a5fc7f3fa6893a83143a1755aa6b4497efa33c.tar.gz fuzzycat-54a5fc7f3fa6893a83143a1755aa6b4497efa33c.zip  | |
verify: move out some code to utils
Diffstat (limited to 'fuzzycat')
| -rw-r--r-- | fuzzycat/data.py | 2 | ||||
| -rw-r--r-- | fuzzycat/utils.py | 24 | ||||
| -rw-r--r-- | fuzzycat/verify.py | 22 | 
3 files changed, 29 insertions, 19 deletions
diff --git a/fuzzycat/data.py b/fuzzycat/data.py index 6a3cc27..9612065 100644 --- a/fuzzycat/data.py +++ b/fuzzycat/data.py @@ -1,5 +1,5 @@  """ -Static assets, e.g. blacklists. +Static assets, e.g. whitelists, blacklists, etc.  """  __all__ = [ diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py index 7f08fa3..682f912 100644 --- a/fuzzycat/utils.py +++ b/fuzzycat/utils.py @@ -1,3 +1,4 @@ +import collections  import io  import itertools  import re @@ -10,15 +11,34 @@ printable_no_punct = string.digits + string.ascii_letters + string.whitespace  # More correct: https://www.johndcook.com/blog/2016/02/04/regular-expression-to-match-a-chemical-element/  CHEM_FORMULA = re.compile(r"([A-Z]{1,2}[0-9]{1,2})+") +ParsedPages = collections.namedtuple("ParsedPages", "start end count") +  def parse_page_string(s):      """      Parse typical page strings, e.g. 150-180.      """ -    raise NotImplementedError() +    if not s: +        raise ValueError('page parsing: empty string') +    if s.isnumeric(): +        return ParsedPages(start=int(s), end=int(s), count=1) +    page_pattern = re.compile("([0-9]{1,})-([0-9]{1,})") +    match = page_pattern.match(s) +    if not match: +        raise ValueError('cannot parse page pattern from {}'.format(s)) +    start, end = match.groups() +    if len(end) == 1 and start and start[-1] < end: +        # 261-5, odd, but happens +        end = start[:-1] + end +    a, b = int(start), int(end) +    if a > b: +        raise ValueError('invalid page range: {}'.format(s)) +    count = b - a + 1 +    return ParsedPages(start=a, end=b, count=count)  def dict_key_exists(doc, path):      """ -    Return true, if key at a given path exists. XXX: probably already in glom. +    Return true, if key in a dictionary at a given path exists. XXX: probably +    already in glom.      
"""      try:          _ = glom(doc, path) diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py index bd8e69b..9fcd62e 100644 --- a/fuzzycat/verify.py +++ b/fuzzycat/verify.py @@ -90,7 +90,7 @@ from fuzzycat.common import Reason, Status  from fuzzycat.data import (CONTAINER_NAME_BLACKLIST, PUBLISHER_BLACKLIST, TITLE_BLACKLIST,                             TITLE_FRAGMENT_BLACKLIST)  from fuzzycat.utils import (author_similarity_score, contains_chemical_formula, dict_key_exists, -                            has_doi_prefix, jaccard, num_project, slugify_string) +                            has_doi_prefix, jaccard, num_project, slugify_string, parse_page_string)  class GroupVerifier: @@ -577,21 +577,11 @@ def verify(a: Dict, b: Dict, min_title_length=5) -> Tuple[str, str]:      # https://fatcat.wiki/release/tm3gaiumkvb3xc7t3i6suna6u4      # https://fatcat.wiki/release/r6dj63wh3zcrrolisn6xuacnve      try: -        a_pages = glom(a, "pages") -        b_pages = glom(b, "pages") -        # XXX: Pages might be of the form "261-5", meaning: 261-265. 
-        page_pattern = re.compile("([0-9]{1,})-([0-9]{1,})") -        a_match = page_pattern.match(a_pages) -        b_match = page_pattern.match(b_pages) -        if a_match and b_match: -            a_start, a_end = a_match.groups() -            b_start, b_end = b_match.groups() -            a_num_pages = int(a_end) - int(a_start) -            b_num_pages = int(b_end) - int(b_start) -            if a_num_pages >= 0 and b_num_pages >= 0: -                if abs(a_num_pages - b_num_pages) > 5: -                    return (Status.DIFFERENT, Reason.PAGE_COUNT) -    except PathAccessError: +        a_parsed_pages = parse_page_string(glom(a, "pages")) +        b_parsed_pages = parse_page_string(glom(b, "pages")) +        if abs(a_parsed_pages.count - b_parsed_pages.count) > 5: +            return (Status.DIFFERENT, Reason.PAGE_COUNT) +    except (ValueError, PathAccessError):          pass      return (Status.AMBIGUOUS, Reason.DUMMY)  | 
