From e4eabbf46ff5fa0982ee5acbfc04400c7836c29a Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 29 May 2019 18:13:53 -0700 Subject: is_cjk() handles kanji better --- python/fatcat_tools/importers/common.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'python/fatcat_tools') diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index 79425618..43ca1b10 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -147,7 +147,8 @@ def is_cjk(s): return False for c in s: if c.isalpha(): - return unicodedata.name(c).startswith("CJK") + lang_prefix = unicodedata.name(c).split()[0] + return lang_prefix in ('CJK', 'HIRAGANA', 'KATAKANA', 'HANGUL') return False def test_is_cjk(): @@ -159,9 +160,10 @@ def test_is_cjk(): assert is_cjk('菊') == True assert is_cjk('岡, 鹿, 梨, 阜, 埼 with eng after') == True assert is_cjk('水道') == True - # TODO: assert is_cjk('ひヒ') == True - # TODO: assert is_cjk('き゚ゅ') == True - # TODO: assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') == True + assert is_cjk('オウ, イク') == True # kanji + assert is_cjk('ひヒ') == True + assert is_cjk('き゚ゅ') == True + assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') == True DOMAIN_REL_MAP = { "archive.org": "archive", -- cgit v1.2.3