From 54a5fc7f3fa6893a83143a1755aa6b4497efa33c Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Mon, 14 Dec 2020 20:31:54 +0100 Subject: verify: move out some code to utils --- fuzzycat/data.py | 2 +- fuzzycat/utils.py | 24 ++++++++++++++++++++++-- fuzzycat/verify.py | 22 ++++++---------------- tests/test_utils.py | 21 ++++++++++++++++++++- 4 files changed, 49 insertions(+), 20 deletions(-) diff --git a/fuzzycat/data.py b/fuzzycat/data.py index 6a3cc27..9612065 100644 --- a/fuzzycat/data.py +++ b/fuzzycat/data.py @@ -1,5 +1,5 @@ """ -Static assets, e.g. blacklists. +Static assets, e.g. whitelists, blacklists, etc. """ __all__ = [ diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py index 7f08fa3..682f912 100644 --- a/fuzzycat/utils.py +++ b/fuzzycat/utils.py @@ -1,3 +1,4 @@ +import collections import io import itertools import re @@ -10,15 +11,34 @@ printable_no_punct = string.digits + string.ascii_letters + string.whitespace # More correct: https://www.johndcook.com/blog/2016/02/04/regular-expression-to-match-a-chemical-element/ CHEM_FORMULA = re.compile(r"([A-Z]{1,2}[0-9]{1,2})+") +ParsedPages = collections.namedtuple("ParsedPages", "start end count") + def parse_page_string(s): """ Parse typical page strings, e.g. 150-180. """ - raise NotImplementedError() + if not s: + raise ValueError('page parsing: empty string') + if s.isnumeric(): + return ParsedPages(start=int(s), end=int(s), count=1) + page_pattern = re.compile("([0-9]{1,})-([0-9]{1,})") + match = page_pattern.match(s) + if not match: + raise ValueError('cannot parse page pattern from {}'.format(s)) + start, end = match.groups() + if len(end) == 1 and start and start[-1] < end: + # 261-5, odd, but happens + end = start[:-1] + end + a, b = int(start), int(end) + if a > b: + raise ValueError('invalid page range: {}'.format(s)) + count = b - a + 1 + return ParsedPages(start=a, end=b, count=count) def dict_key_exists(doc, path): """ - Return true, if key at a given path exists. XXX: probably already in glom. 
+ Return true, if key in a dictionary at a given path exists. XXX: probably + already in glom. """ try: _ = glom(doc, path) diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py index bd8e69b..9fcd62e 100644 --- a/fuzzycat/verify.py +++ b/fuzzycat/verify.py @@ -90,7 +90,7 @@ from fuzzycat.common import Reason, Status from fuzzycat.data import (CONTAINER_NAME_BLACKLIST, PUBLISHER_BLACKLIST, TITLE_BLACKLIST, TITLE_FRAGMENT_BLACKLIST) from fuzzycat.utils import (author_similarity_score, contains_chemical_formula, dict_key_exists, - has_doi_prefix, jaccard, num_project, slugify_string) + has_doi_prefix, jaccard, num_project, slugify_string, parse_page_string) class GroupVerifier: @@ -577,21 +577,11 @@ def verify(a: Dict, b: Dict, min_title_length=5) -> Tuple[str, str]: # https://fatcat.wiki/release/tm3gaiumkvb3xc7t3i6suna6u4 # https://fatcat.wiki/release/r6dj63wh3zcrrolisn6xuacnve try: - a_pages = glom(a, "pages") - b_pages = glom(b, "pages") - # XXX: Pages might be of the form "261-5", meaning: 261-265. 
- page_pattern = re.compile("([0-9]{1,})-([0-9]{1,})") - a_match = page_pattern.match(a_pages) - b_match = page_pattern.match(b_pages) - if a_match and b_match: - a_start, a_end = a_match.groups() - b_start, b_end = b_match.groups() - a_num_pages = int(a_end) - int(a_start) - b_num_pages = int(b_end) - int(b_start) - if a_num_pages >= 0 and b_num_pages >= 0: - if abs(a_num_pages - b_num_pages) > 5: - return (Status.DIFFERENT, Reason.PAGE_COUNT) - except PathAccessError: + a_parsed_pages = parse_page_string(glom(a, "pages")) + b_parsed_pages = parse_page_string(glom(b, "pages")) + if abs(a_parsed_pages.count - b_parsed_pages.count) > 5: + return (Status.DIFFERENT, Reason.PAGE_COUNT) + except (ValueError, PathAccessError): pass return (Status.AMBIGUOUS, Reason.DUMMY) diff --git a/tests/test_utils.py b/tests/test_utils.py index a2033ac..38d50a7 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,7 +1,7 @@ import pytest from fuzzycat.utils import (author_similarity_score, cut, jaccard, nwise, slugify_string, - token_n_grams, tokenize_string) + token_n_grams, tokenize_string, parse_page_string, dict_key_exists) def test_slugify_string(): @@ -63,3 +63,22 @@ def test_nwise(): assert list(nwise("1234")) == [("1", "2"), ("3", "4")] assert list(nwise("1234", n=1)) == [("1", ), ("2", ), ("3", ), ("4", )] assert list(nwise([1, 2, 3, 4, 5], n=3)) == [(1, 2, 3), (4, 5)] + +def test_dict_key_exists(): + assert dict_key_exists({}, "") is False + assert dict_key_exists({"a": "a"}, "a") == True + assert dict_key_exists({"a": "a"}, "b") == False + assert dict_key_exists({"a": {"b": "c"}}, "a.b") == True + assert dict_key_exists({"a": {"b": None}}, "a.b") == True + assert dict_key_exists({"a": {"b": "c"}}, "a.b.c") == False + +def test_page_page_string(): + reject = ("", "123-2", "123-120", "123a-124", "-2-1") + for s in reject: + with pytest.raises(ValueError): + assert parse_page_string(s) + assert parse_page_string("123") == (123, 123, 1) + assert 
parse_page_string("123-5") == (123, 125, 3) + assert parse_page_string("123-125") == (123, 125, 3) + assert parse_page_string("123-124a") == (123, 124, 2) + assert parse_page_string("1-1000") == (1, 1000, 1000) -- cgit v1.2.3