diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2020-12-10 03:13:45 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2020-12-10 03:13:45 +0100 |
commit | 1cc2de7f7349b08d4e807cf5c022ab92c410fe2d (patch) | |
tree | 0aba48f77d8f8b35b214f74d94ce9d4daddc2416 /fuzzycat | |
parent | 7ab933683f05a8de8ec416d520690d86b9a46a16 (diff) | |
download | fuzzycat-1cc2de7f7349b08d4e807cf5c022ab92c410fe2d.tar.gz fuzzycat-1cc2de7f7349b08d4e807cf5c022ab92c410fe2d.zip |
add cases
Diffstat (limited to 'fuzzycat')
-rw-r--r-- | fuzzycat/common.py | 1 |
-rw-r--r-- | fuzzycat/verify.py | 22 |
2 files changed, 23 insertions, 0 deletions
diff --git a/fuzzycat/common.py b/fuzzycat/common.py
index 5bf033c..20a5ddd 100644
--- a/fuzzycat/common.py
+++ b/fuzzycat/common.py
@@ -56,6 +56,7 @@ class Miss(str, Enum):
     DATASET_DOI = 'miss.dataset_doi'
     JSTOR_ID = 'miss.jstor_id'
     NUM_DIFF = 'miss.num_diff'
+    PAGE_COUNT = 'miss.page_count'
     RELEASE_TYPE = 'miss.release_type'
     SHARED_DOI_PREFIX = 'miss.shared_doi_prefix'
     SHORT_TITLE = 'miss.short_title'
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 88e83d5..76571da 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -322,6 +322,8 @@ def compare(a, b):
         ])
         if len(types & ignore_release_types) == 0:
             return (Status.DIFFERENT, Miss.RELEASE_TYPE)
+        if "dataset" in types and ("article" in types or "article-journal" in types):
+            return (Status.DIFFERENT, Miss.RELEASE_TYPE)
     except PathAccessError:
         pass
 
@@ -543,4 +545,24 @@ def compare(a, b):
     except PathAccessError:
         pass
 
+    # If pages exists, but differ too much, bail out.
+    # https://fatcat.wiki/release/tm3gaiumkvb3xc7t3i6suna6u4
+    # https://fatcat.wiki/release/r6dj63wh3zcrrolisn6xuacnve
+    try:
+        a_pages = glom(a, "pages")
+        b_pages = glom(b, "pages")
+        page_pattern = re.compile("([0-9]{1,})-([0-9]{1,})")
+        a_match = page_pattern.match(a_pages)
+        b_match = page_pattern.match(b_pages)
+        if a_match and b_match:
+            a_start, a_end = a_match.groups()
+            b_start, b_end = b_match.groups()
+            a_num_pages = int(a_end) - int(a_start)
+            b_num_pages = int(b_end) - int(b_start)
+            if a_num_pages >= 0 and b_num_pages >= 0:
+                if abs(a_num_pages - b_num_pages) > 5:
+                    return (Status.DIFFERENT, Miss.PAGE_COUNT)
+    except PathAccessError:
+        pass
+
     return (Status.AMBIGUOUS, OK.DUMMY)