diff options
| author | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-29 18:13:53 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-05-29 18:13:53 -0700 | 
| commit | e4eabbf46ff5fa0982ee5acbfc04400c7836c29a (patch) | |
| tree | 1f8e2c92368b5bfb4892d7a5b4209df6bd78436c /python/fatcat_tools | |
| parent | ab9f5664a89cb19fbe17b42332d2d7284ab9c416 (diff) | |
| download | fatcat-e4eabbf46ff5fa0982ee5acbfc04400c7836c29a.tar.gz fatcat-e4eabbf46ff5fa0982ee5acbfc04400c7836c29a.zip | |
is_cjk() handles kanji better
Diffstat (limited to 'python/fatcat_tools')
| -rw-r--r-- | python/fatcat_tools/importers/common.py | 10 | 
1 files changed, 6 insertions, 4 deletions
```diff
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 79425618..43ca1b10 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -147,7 +147,8 @@ def is_cjk(s):
         return False
     for c in s:
         if c.isalpha():
-            return unicodedata.name(c).startswith("CJK")
+            lang_prefix = unicodedata.name(c).split()[0]
+            return lang_prefix in ('CJK', 'HIRAGANA', 'KATAKANA', 'HANGUL')
     return False
 
 def test_is_cjk():
@@ -159,9 +160,10 @@ def test_is_cjk():
     assert is_cjk('菊') == True
     assert is_cjk('岡, 鹿, 梨, 阜, 埼 with eng after') == True
     assert is_cjk('水道') == True
-    # TODO: assert is_cjk('ひヒ') == True
-    # TODO: assert is_cjk('き゚ゅ') == True
-    # TODO: assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') == True
+    assert is_cjk('オウ, イク') == True # kanji
+    assert is_cjk('ひヒ') == True
+    assert is_cjk('き゚ゅ') == True
+    assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') == True
 
 DOMAIN_REL_MAP = {
     "archive.org": "archive",
```
