diff options
Diffstat (limited to 'python/fatcat_tools/importers/common.py')
-rw-r--r-- | python/fatcat_tools/importers/common.py | 83 |
1 files changed, 78 insertions, 5 deletions
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index 7fca38cf..e37d57ec 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -20,6 +20,75 @@ DATE_FMT = "%Y-%m-%d" SANE_MAX_RELEASES = 200 SANE_MAX_URLS = 100 +# These are very close, but maybe not exactly 1-to-1 with 639-2? Some mix of +# 2/T and 2/B? +# PubMed/MEDLINE and JSTOR use these MARC codes +LANG_MAP_MARC = { + 'afr': 'af', + 'alb': 'sq', + 'amh': 'am', + 'ara': 'ar', + 'arm': 'hy', + 'aze': 'az', + 'ben': 'bn', + 'bos': 'bs', + 'bul': 'bg', + 'cat': 'ca', + 'chi': 'zh', + 'cze': 'cs', + 'dan': 'da', + 'dut': 'nl', + 'eng': 'en', + 'epo': 'eo', + 'est': 'et', + 'fin': 'fi', + 'fre': 'fr', + 'geo': 'ka', + 'ger': 'de', + 'gla': 'gd', + 'gre': 'el', + 'heb': 'he', + 'hin': 'hi', + 'hrv': 'hr', + 'hun': 'hu', + 'ice': 'is', + 'ind': 'id', + 'ita': 'it', + 'jpn': 'ja', + 'kin': 'rw', + 'kor': 'ko', + 'lat': 'la', + 'lav': 'lv', + 'lit': 'lt', + 'mac': 'mk', + 'mal': 'ml', + 'mao': 'mi', + 'may': 'ms', +# mul, Multiple languages + 'nor': 'no', + 'per': 'fa', + 'per': 'fa', + 'pol': 'pl', + 'por': 'pt', + 'pus': 'ps', + 'rum': 'ro', + 'rus': 'ru', + 'san': 'sa', + 'slo': 'sk', + 'slv': 'sl', + 'spa': 'es', + 'srp': 'sr', + 'swe': 'sv', + 'tha': 'th', + 'tur': 'tr', + 'ukr': 'uk', +# und, Undetermined + 'urd': 'ur', + 'vie': 'vi', + 'wel': 'cy', +} + + def clean(thing, force_xml=False): """ This function is appropriate to be called on any random, non-markup string, @@ -58,19 +127,23 @@ def test_clean(): def is_cjk(s): if not s: return False - return unicodedata.name(s[0]).startswith("CJK") + for c in s: + if c.isalpha(): + return unicodedata.name(c).startswith("CJK") + return False def test_is_cjk(): assert is_cjk(None) == False assert is_cjk('') == False assert is_cjk('blah') == False assert is_cjk('岡, 鹿, 梨, 阜, 埼') == True + assert is_cjk('[岡, 鹿, 梨, 阜, 埼]') == True assert is_cjk('菊') == True - assert is_cjk('ひヒ') == True - assert is_cjk('english with ひヒ') == True - assert is_cjk('き゚ゅ') == True + assert is_cjk('岡, 鹿, 梨, 阜, 埼 with eng after') == True assert is_cjk('水道') == True - assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') == True + # TODO: assert is_cjk('ひヒ') == True + # TODO: assert is_cjk('き゚ゅ') == True + # TODO: assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') == True DOMAIN_REL_MAP = { "archive.org": "archive", |