diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2020-12-01 23:09:00 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2020-12-01 23:09:00 +0100 |
commit | 8183e792ae122ae66b66299da1948697ae296ac7 (patch) | |
tree | 53d24af28488737e639ef3f05efa55f7dc07c1d7 /fuzzycat | |
parent | 27e6e0b07b091cd0dd3e66a01ebed846dc9e9f50 (diff) | |
download | fuzzycat-8183e792ae122ae66b66299da1948697ae296ac7.tar.gz fuzzycat-8183e792ae122ae66b66299da1948697ae296ac7.zip |
add another case
Diffstat (limited to 'fuzzycat')
-rw-r--r-- | fuzzycat/common.py | 1 | ||||
-rw-r--r-- | fuzzycat/utils.py | 2 | ||||
-rw-r--r-- | fuzzycat/verify.py | 33 |
3 files changed, 34 insertions, 2 deletions
```diff
diff --git a/fuzzycat/common.py b/fuzzycat/common.py
index 3973b1e..2298185 100644
--- a/fuzzycat/common.py
+++ b/fuzzycat/common.py
@@ -25,6 +25,7 @@ class OK(str, Enum):
     SLUG_TITLE_AUTHOR_MATCH = 'ok.slug_title_author_match'
     TITLE_AUTHOR_MATCH = 'ok.title_author_match'
     TOKENIZED_AUTHORS = 'ok.tokenized_authors'
+    CUSTOM_IEEE_ARXIV = 'ok.custom_ieee_arxiv'
 
 
 class Miss(str, Enum):
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index 4d1325d..d6beb03 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -13,7 +13,7 @@ def slugify_string(s: str) -> str:
     """
     Keeps ascii chars and single whitespace only.
     """
-    return ''.join((c for c in s.lower() if c in printable_no_punct))
+    return ' '.join(''.join((c for c in s.lower() if c in printable_no_punct)).split())
 
 
 def cut(f: int = 0, sep: str = '\t', ignore_missing_column: bool = True):
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 81c97ff..84e17d8 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -240,7 +240,12 @@ def compare(a, b):
             # Added "entry" via
             # https://fatcat.wiki/release/xp3oxb7tqbgaxdzkzbchfkcjn4,
             # https://fatcat.wiki/release/73pcaauzwbalvi7aqhv6vopxl4
-            ignore_release_types = set(["article", "article-journal", "report", "paper-conference", "entry", "book"])
+            ignore_release_types = set([
+                "article",
+                "article-journal",
+                "report",
+                "paper-conference",
+            ])
             if len(types & ignore_release_types) == 0:
                 return (Status.DIFFERENT, Miss.RELEASE_TYPE)
     except PathAccessError:
@@ -270,6 +275,32 @@ def compare(a, b):
     a_slug_title = slugify_string(a.get("title", "")).replace("\n", " ")
     b_slug_title = slugify_string(b.get("title", "")).replace("\n", " ")
 
+    try:
+        if glom(a, "ext_ids.doi") == "10.1109/nssmic.2013.6829591":
+            print(a_slug_title)
+            print(b_slug_title)
+    except PathAccessError:
+        pass
+
+    if a_slug_title == b_slug_title:
+        # via: https://fatcat.wiki/release/ij3yuoh6lrh3tkrv5o7gfk6yyi
+        # https://fatcat.wiki/release/tur236mqljdfdnlzbbnks2sily
+        def ieee_arxiv_pair_check(a, b):
+            try:
+                print(a_slug_title, glom(a, "ext_ids.doi"))
+                if (glom(a, "ext_ids.doi").split("/")[0] == "10.1109"
+                        and glom(b, "ext_ids.arxiv") != ""):
+                    return (Status.STRONG, OK.CUSTOM_IEEE_ARXIV)
+            except PathAccessError:
+                pass
+
+        result = ieee_arxiv_pair_check(a, b)
+        if result:
+            return result
+        result = ieee_arxiv_pair_check(b, a)
+        if result:
+            return result
+
     if a_slug_title == b_slug_title:
         try:
             a_subtitles = glom(a, "extra.subtitle") or []
```