diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2020-12-02 18:59:59 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2020-12-02 18:59:59 +0100 |
commit | 03f65cfcb88451fecb106c2c750643931b9ecc77 (patch) | |
tree | 5922ce17de6c79c6e4f46de85acbfedbb4bc23df /fuzzycat | |
parent | 50bf407e8f7e484eaef4a02dc44e59b3a8ceeef8 (diff) | |
download | fuzzycat-03f65cfcb88451fecb106c2c750643931b9ecc77.tar.gz fuzzycat-03f65cfcb88451fecb106c2c750643931b9ecc77.zip |
add iop case
Diffstat (limited to 'fuzzycat')
-rw-r--r-- | fuzzycat/common.py | 3 | ||||
-rw-r--r-- | fuzzycat/verify.py | 14 |
2 files changed, 14 insertions, 3 deletions
diff --git a/fuzzycat/common.py b/fuzzycat/common.py
index 8ebc43e..60f42ab 100644
--- a/fuzzycat/common.py
+++ b/fuzzycat/common.py
@@ -43,6 +43,8 @@ class Miss(str, Enum):
     COMPONENT = 'miss.component'
     CONTAINER = 'miss.container'
     CONTRIB_INTERSECTION_EMPTY = 'miss.contrib_intersection_empty'
+    CUSTOM_IOP_MA_PATTERN = 'miss.custom_iop_ma_pattern'
+    CUSTOM_PREFIX_10_14288 = 'miss.custom_prefix_10_14288'
     CUSTOM_VHS = 'miss.vhs'  # https://fatcat.wiki/release/44gk5ben5vghljq6twm7lwmxla
     DATASET_DOI = 'miss.dataset_doi'
     NUM_DIFF = 'miss.num_diff'
@@ -51,4 +53,3 @@ class Miss(str, Enum):
     SUBTITLE = 'miss.subtitle'
     TITLE_FILENAME = 'miss.title_filename'
     YEAR = 'miss.year'
-    CUSTOM_PREFIX_10_14288 = 'miss.custom_prefix_10_14288'
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index f44d9db..993b7c9 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -159,12 +159,10 @@ def compare(a, b):
         if fragment in a_title_lower:
             return (Status.AMBIGUOUS, Miss.BLACKLISTED_FRAGMENT)
 
-
     # https://fatcat.wiki/release/rnso2swxzvfonemgzrth3arumi,
     # https://fatcat.wiki/release/caxa7qbfqvg3bkgz4nwvapgnvi
     if "subject index" in a_title_lower and "subject index" in b_title_lower:
         try:
-            print(a, b)
             if glom(a, "container_id") != glom(b, "container_id"):
                 return (Status.DIFFERENT, Miss.CONTAINER)
         except PathAccessError:
@@ -198,6 +196,16 @@ def compare(a, b):
     except PathAccessError:
         pass
 
+    try:
+        a_doi = glom(a, "ext_ids.doi")
+        b_doi = glom(b, "ext_ids.doi")
+        if has_doi_prefix(a_doi, "10.1149") and has_doi_prefix(b_doi, "10.1149"):
+            if (a_doi.startswith("10.1149/ma") and not b_doi.startswith("10.1149/ma")
+                    or b_doi.startswith("10.1149/ma") and not a_doi.startswith("10.1149/ma")):
+                return (Status.DIFFERENT, Miss.CUSTOM_IOP_MA_PATTERN)
+    except PathAccessError:
+        pass
+
     if "Zweckverband Volkshochschule " in a_title and a_title != b_title:
         return (Status.DIFFERENT, Miss.CUSTOM_VHS)
 
@@ -426,6 +434,8 @@ TITLE_FRAGMENT_BLACKLIST = set([
     "nouvelles du corps médical",
     "student government 
minutes:",
     "untersuchung einzelner abdominaler regionen und organe",
+    "annual general meeting",
+    "records of meetings",
 ])
 
 CONTAINER_NAME_BLACKLIST = set([