about | summary | refs | log | tree | commit | diff | stats
path: root/fuzzycat
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2020-12-10 03:13:45 +0100
committer Martin Czygan <martin.czygan@gmail.com> 2020-12-10 03:13:45 +0100
commit 1cc2de7f7349b08d4e807cf5c022ab92c410fe2d (patch)
tree 0aba48f77d8f8b35b214f74d94ce9d4daddc2416 /fuzzycat
parent 7ab933683f05a8de8ec416d520690d86b9a46a16 (diff)
download fuzzycat-1cc2de7f7349b08d4e807cf5c022ab92c410fe2d.tar.gz
fuzzycat-1cc2de7f7349b08d4e807cf5c022ab92c410fe2d.zip
add cases
Diffstat (limited to 'fuzzycat')
-rw-r--r-- fuzzycat/common.py 1
-rw-r--r-- fuzzycat/verify.py 22
2 files changed, 23 insertions, 0 deletions
diff --git a/fuzzycat/common.py b/fuzzycat/common.py
index 5bf033c..20a5ddd 100644
--- a/fuzzycat/common.py
+++ b/fuzzycat/common.py
@@ -56,6 +56,7 @@ class Miss(str, Enum):
DATASET_DOI = 'miss.dataset_doi'
JSTOR_ID = 'miss.jstor_id'
NUM_DIFF = 'miss.num_diff'
+ PAGE_COUNT = 'miss.page_count'
RELEASE_TYPE = 'miss.release_type'
SHARED_DOI_PREFIX = 'miss.shared_doi_prefix'
SHORT_TITLE = 'miss.short_title'
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 88e83d5..76571da 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -322,6 +322,8 @@ def compare(a, b):
])
if len(types & ignore_release_types) == 0:
return (Status.DIFFERENT, Miss.RELEASE_TYPE)
+ if "dataset" in types and ("article" in types or "article-journal" in types):
+ return (Status.DIFFERENT, Miss.RELEASE_TYPE)
except PathAccessError:
pass
@@ -543,4 +545,24 @@ def compare(a, b):
except PathAccessError:
pass
+ # If both releases have pages, but their page counts differ too much, bail out.
+ # https://fatcat.wiki/release/tm3gaiumkvb3xc7t3i6suna6u4
+ # https://fatcat.wiki/release/r6dj63wh3zcrrolisn6xuacnve
+ try:
+ a_pages = glom(a, "pages")
+ b_pages = glom(b, "pages")
+ page_pattern = re.compile("([0-9]{1,})-([0-9]{1,})")
+ a_match = page_pattern.match(a_pages)
+ b_match = page_pattern.match(b_pages)
+ if a_match and b_match:
+ a_start, a_end = a_match.groups()
+ b_start, b_end = b_match.groups()
+ a_num_pages = int(a_end) - int(a_start)
+ b_num_pages = int(b_end) - int(b_start)
+ if a_num_pages >= 0 and b_num_pages >= 0:
+ if abs(a_num_pages - b_num_pages) > 5:
+ return (Status.DIFFERENT, Miss.PAGE_COUNT)
+ except PathAccessError:
+ pass
+
return (Status.AMBIGUOUS, OK.DUMMY)