aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers/common.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2019-05-16 13:53:15 -0700
committer Bryan Newbold <bnewbold@robocracy.org> 2019-05-21 11:41:29 -0700
commit 6ff79f47c7c7ae27b28685674672e58b7dd4d271 (patch)
tree 2470f89de864207da8ccc92151cb35d5e20ba21b /python/fatcat_tools/importers/common.py
parent 300665927f578151321b0d91b28f8aadffcf227d (diff)
download fatcat-6ff79f47c7c7ae27b28685674672e58b7dd4d271.tar.gz
fatcat-6ff79f47c7c7ae27b28685674672e58b7dd4d271.zip
tweaks to new imports/tests
Diffstat (limited to 'python/fatcat_tools/importers/common.py')
-rw-r--r-- python/fatcat_tools/importers/common.py | 83
1 files changed, 78 insertions, 5 deletions
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 7fca38cf..e37d57ec 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -20,6 +20,75 @@ DATE_FMT = "%Y-%m-%d"
SANE_MAX_RELEASES = 200
SANE_MAX_URLS = 100
+# Maps three-letter MARC language codes to two-letter (ISO 639-1 style) codes.
+# The MARC codes are very close to, but maybe not exactly 1-to-1 with, ISO
+# 639-2 (some mix of 2/T and 2/B?). PubMed/MEDLINE and JSTOR use these codes.
+LANG_MAP_MARC = {
+ 'afr': 'af',
+ 'alb': 'sq',
+ 'amh': 'am',
+ 'ara': 'ar',
+ 'arm': 'hy',
+ 'aze': 'az',
+ 'ben': 'bn',
+ 'bos': 'bs',
+ 'bul': 'bg',
+ 'cat': 'ca',
+ 'chi': 'zh',
+ 'cze': 'cs',
+ 'dan': 'da',
+ 'dut': 'nl',
+ 'eng': 'en',
+ 'epo': 'eo',
+ 'est': 'et',
+ 'fin': 'fi',
+ 'fre': 'fr',
+ 'geo': 'ka',
+ 'ger': 'de',
+ 'gla': 'gd',
+ 'gre': 'el',
+ 'heb': 'he',
+ 'hin': 'hi',
+ 'hrv': 'hr',
+ 'hun': 'hu',
+ 'ice': 'is',
+ 'ind': 'id',
+ 'ita': 'it',
+ 'jpn': 'ja',
+ 'kin': 'rw',
+ 'kor': 'ko',
+ 'lat': 'la',
+ 'lav': 'lv',
+ 'lit': 'lt',
+ 'mac': 'mk',
+ 'mal': 'ml',
+ 'mao': 'mi',
+ 'may': 'ms',
+# mul, Multiple languages
+ 'nor': 'no',
+ 'per': 'fa',
+ 'per': 'fa',
+ 'pol': 'pl',
+ 'por': 'pt',
+ 'pus': 'ps',
+ 'rum': 'ro',
+ 'rus': 'ru',
+ 'san': 'sa',
+ 'slo': 'sk',
+ 'slv': 'sl',
+ 'spa': 'es',
+ 'srp': 'sr',
+ 'swe': 'sv',
+ 'tha': 'th',
+ 'tur': 'tr',
+ 'ukr': 'uk',
+# und, Undetermined
+ 'urd': 'ur',
+ 'vie': 'vi',
+ 'wel': 'cy',
+}
+
+
def clean(thing, force_xml=False):
"""
This function is appropriate to be called on any random, non-markup string,
@@ -58,19 +127,23 @@ def test_clean():
def is_cjk(s):
if not s:
return False
- return unicodedata.name(s[0]).startswith("CJK")
+ for c in s:
+ if c.isalpha():
+ return unicodedata.name(c).startswith("CJK")
+ return False
def test_is_cjk():
assert is_cjk(None) == False
assert is_cjk('') == False
assert is_cjk('blah') == False
assert is_cjk('岡, 鹿, 梨, 阜, 埼') == True
+ assert is_cjk('[岡, 鹿, 梨, 阜, 埼]') == True
assert is_cjk('菊') == True
- assert is_cjk('ひヒ') == True
- assert is_cjk('english with ひヒ') == True
- assert is_cjk('き゚ゅ') == True
+ assert is_cjk('岡, 鹿, 梨, 阜, 埼 with eng after') == True
assert is_cjk('水道') == True
- assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') == True
+ # TODO: assert is_cjk('ひヒ') == True
+ # TODO: assert is_cjk('き゚ゅ') == True
+ # TODO: assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') == True
DOMAIN_REL_MAP = {
"archive.org": "archive",