tweaks to new imports/tests

author: Bryan Newbold <bnewbold@robocracy.org> 2019-05-16 13:53:15 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2019-05-21 11:41:29 -0700
commit: 6ff79f47c7c7ae27b28685674672e58b7dd4d271 (patch)
tree: 2470f89de864207da8ccc92151cb35d5e20ba21b /python/fatcat_tools/importers/common.py
parent: 300665927f578151321b0d91b28f8aadffcf227d (diff)
download: fatcat-6ff79f47c7c7ae27b28685674672e58b7dd4d271.tar.gz
fatcat-6ff79f47c7c7ae27b28685674672e58b7dd4d271.zip
1 files changed, 78 insertions, 5 deletions
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 7fca38cf..e37d57ec 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -20,6 +20,75 @@ DATE_FMT = "%Y-%m-%d"
 SANE_MAX_RELEASES = 200
 SANE_MAX_URLS = 100
 
+# These are very close, but maybe not exactly 1-to-1 with 639-2? Some mix of
+# 2/T and 2/B?
+# PubMed/MEDLINE and JSTOR use these MARC codes
+LANG_MAP_MARC = {
+    'afr': 'af',
+    'alb': 'sq',
+    'amh': 'am',
+    'ara': 'ar',
+    'arm': 'hy',
+    'aze': 'az',
+    'ben': 'bn',
+    'bos': 'bs',
+    'bul': 'bg',
+    'cat': 'ca',
+    'chi': 'zh',
+    'cze': 'cs',
+    'dan': 'da',
+    'dut': 'nl',
+    'eng': 'en',
+    'epo': 'eo',
+    'est': 'et',
+    'fin': 'fi',
+    'fre': 'fr',
+    'geo': 'ka',
+    'ger': 'de',
+    'gla': 'gd',
+    'gre': 'el',
+    'heb': 'he',
+    'hin': 'hi',
+    'hrv': 'hr',
+    'hun': 'hu',
+    'ice': 'is',
+    'ind': 'id',
+    'ita': 'it',
+    'jpn': 'ja',
+    'kin': 'rw',
+    'kor': 'ko',
+    'lat': 'la',
+    'lav': 'lv',
+    'lit': 'lt',
+    'mac': 'mk',
+    'mal': 'ml',
+    'mao': 'mi',
+    'may': 'ms',
+#    mul, Multiple languages
+    'nor': 'no',
+    'per': 'fa',
+    'per': 'fa',
+    'pol': 'pl',
+    'por': 'pt',
+    'pus': 'ps',
+    'rum': 'ro',
+    'rus': 'ru',
+    'san': 'sa',
+    'slo': 'sk',
+    'slv': 'sl',
+    'spa': 'es',
+    'srp': 'sr',
+    'swe': 'sv',
+    'tha': 'th',
+    'tur': 'tr',
+    'ukr': 'uk',
+#    und, Undetermined
+    'urd': 'ur',
+    'vie': 'vi',
+    'wel': 'cy',
+}
+
+
 def clean(thing, force_xml=False):
     """
     This function is appropriate to be called on any random, non-markup string,
@@ -58,19 +127,23 @@ def test_clean():
 def is_cjk(s):
     if not s:
         return False
-    return unicodedata.name(s[0]).startswith("CJK")
+    for c in s:
+        if c.isalpha():
+            return unicodedata.name(c).startswith("CJK")
+    return False
 
 def test_is_cjk():
     assert is_cjk(None) == False
     assert is_cjk('') == False
     assert is_cjk('blah') == False
     assert is_cjk('岡, 鹿, 梨, 阜, 埼') == True
+    assert is_cjk('[岡, 鹿, 梨, 阜, 埼]') == True
     assert is_cjk('菊') == True
-    assert is_cjk('ひヒ') == True
-    assert is_cjk('english with ひヒ') == True
-    assert is_cjk('き゚ゅ') == True
+    assert is_cjk('岡, 鹿, 梨, 阜, 埼 with eng after') == True
     assert is_cjk('水道') == True
-    assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') == True
+    # TODO: assert is_cjk('ひヒ') == True
+    # TODO: assert is_cjk('き゚ゅ') == True
+    # TODO: assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') == True
 
 DOMAIN_REL_MAP = {
     "archive.org": "archive",
author	Bryan Newbold <bnewbold@robocracy.org>	2019-05-16 13:53:15 -0700
committer	Bryan Newbold <bnewbold@robocracy.org>	2019-05-21 11:41:29 -0700
commit	6ff79f47c7c7ae27b28685674672e58b7dd4d271 (patch)
tree	2470f89de864207da8ccc92151cb35d5e20ba21b /python/fatcat_tools/importers/common.py
parent	300665927f578151321b0d91b28f8aadffcf227d (diff)
download	fatcat-6ff79f47c7c7ae27b28685674672e58b7dd4d271.tar.gz fatcat-6ff79f47c7c7ae27b28685674672e58b7dd4d271.zip