summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2019-05-29 18:13:53 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2019-05-29 18:13:53 -0700
commit: e4eabbf46ff5fa0982ee5acbfc04400c7836c29a (patch)
tree: 1f8e2c92368b5bfb4892d7a5b4209df6bd78436c
parent: ab9f5664a89cb19fbe17b42332d2d7284ab9c416 (diff)
download: fatcat-e4eabbf46ff5fa0982ee5acbfc04400c7836c29a.tar.gz
download: fatcat-e4eabbf46ff5fa0982ee5acbfc04400c7836c29a.zip
is_cjk() handles kanji better
-rw-r--r-- python/fatcat_tools/importers/common.py 10
1 files changed, 6 insertions, 4 deletions
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 79425618..43ca1b10 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -147,7 +147,8 @@ def is_cjk(s):
return False
for c in s:
if c.isalpha():
- return unicodedata.name(c).startswith("CJK")
+ lang_prefix = unicodedata.name(c).split()[0]
+ return lang_prefix in ('CJK', 'HIRAGANA', 'KATAKANA', 'HANGUL')
return False
def test_is_cjk():
@@ -159,9 +160,10 @@ def test_is_cjk():
assert is_cjk('菊') == True
assert is_cjk('岡, 鹿, 梨, 阜, 埼 with eng after') == True
assert is_cjk('水道') == True
- # TODO: assert is_cjk('ひヒ') == True
- # TODO: assert is_cjk('き゚ゅ') == True
- # TODO: assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') == True
+ assert is_cjk('オウ, イク') == True # kanji
+ assert is_cjk('ひヒ') == True
+ assert is_cjk('き゚ゅ') == True
+ assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') == True
DOMAIN_REL_MAP = {
"archive.org": "archive",