-rw-r--r--	python/fatcat_tools/importers/common.py | 158
-rw-r--r--	python/fatcat_tools/normal.py | 322
2 files changed, 326 insertions, 154 deletions
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 14415683..3c810391 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -3,12 +3,9 @@ import re
 import sys
 import csv
 import json
-import ftfy
-import base64
 import sqlite3
 import datetime
 import subprocess
-import unicodedata
 from collections import Counter
 from confluent_kafka import Consumer, KafkaException
 import xml.etree.ElementTree as ET
@@ -18,162 +15,13 @@ from bs4 import BeautifulSoup
 import fatcat_openapi_client
 from fatcat_openapi_client.rest import ApiException
 
+# TODO: refactor to remove the need for this (re-imports for backwards compatibility)
+from fatcat_tools.normal import (clean_str as clean, is_cjk, b32_hex, LANG_MAP_MARC)  # noqa: F401
 
 DATE_FMT = "%Y-%m-%d"
 SANE_MAX_RELEASES = 200
 SANE_MAX_URLS = 100
 
-# These are very close, but maybe not exactly 1-to-1 with 639-2? Some mix of
-# 2/T and 2/B?
-# PubMed/MEDLINE and JSTOR use these MARC codes
-# https://www.loc.gov/marc/languages/language_name.html
-LANG_MAP_MARC = {
-    'afr': 'af',
-    'alb': 'sq',
-    'amh': 'am',
-    'ara': 'ar',
-    'arm': 'hy',
-    'aze': 'az',
-    'ben': 'bn',
-    'bos': 'bs',
-    'bul': 'bg',
-    'cat': 'ca',
-    'chi': 'zh',
-    'cze': 'cs',
-    'dan': 'da',
-    'dut': 'nl',
-    'eng': 'en',
-    'epo': 'eo',
-    'est': 'et',
-    'fin': 'fi',
-    'fre': 'fr',
-    'geo': 'ka',
-    'ger': 'de',
-    'gla': 'gd',
-    'gre': 'el',
-    'heb': 'he',
-    'hin': 'hi',
-    'hrv': 'hr',
-    'hun': 'hu',
-    'ice': 'is',
-    'ind': 'id',
-    'ita': 'it',
-    'jpn': 'ja',
-    'kin': 'rw',
-    'kor': 'ko',
-    'lat': 'la',
-    'lav': 'lv',
-    'lit': 'lt',
-    'mac': 'mk',
-    'mal': 'ml',
-    'mao': 'mi',
-    'may': 'ms',
-    'nor': 'no',
-    'per': 'fa',
-    'per': 'fa',
-    'pol': 'pl',
-    'por': 'pt',
-    'pus': 'ps',
-    'rum': 'ro',
-    'rus': 'ru',
-    'san': 'sa',
-    'slo': 'sk',
-    'slv': 'sl',
-    'spa': 'es',
-    'srp': 'sr',
-    'swe': 'sv',
-    'tha': 'th',
-    'tur': 'tr',
-    'ukr': 'uk',
-    'urd': 'ur',
-    'vie': 'vi',
-    'wel': 'cy',
-
-# additions
-    'gle': 'ga', # "Irish" (Gaelic)
-    'jav': 'jv', # Javanese
-    'welsh': 'cy', # Welsh
-    'oci': 'oc', # Occitan
-
-# Don't have ISO 639-1 codes
-    'grc': 'el', # Ancient Greek; map to modern greek
-    'map': None, # Austronesian (collection)
-    'syr': None, # Syriac, Modern
-    'gem': None, # Old Saxon
-    'non': None, # Old Norse
-    'emg': None, # Eastern Meohang
-    'neg': None, # Negidal
-    'mul': None, # Multiple languages
-    'und': None, # Undetermined
-}
-
-
-def clean(thing, force_xml=False):
-    """
-    This function is appropriate to be called on any random, non-markup string,
-    such as author names, titles, etc.
-
-    It will try to clean up common unicode mangles, HTML characters, etc.
-
-    This will detect XML/HTML and "do the right thing" (aka, not remove
-    entities like '&amp;' if there are tags in the string), unless you pass the
-    'force_xml' parameter, which might be appropriate for, eg, names and
-    titles, which generally should be projected down to plain text.
-
-    Also strips extra whitespace.
-    """
-    if not thing:
-        return None
-    fix_entities = 'auto'
-    if force_xml:
-        fix_entities = True
-    fixed = ftfy.fix_text(thing, fix_entities=fix_entities).strip()
-    if not fixed or len(fixed) <= 1:
-        # wasn't zero-length before, but is now; return None
-        return None
-    return fixed
-
-def test_clean():
-
-    assert clean(None) == None
-    assert clean('') == None
-    assert clean('1') == None
-    assert clean('123') == '123'
-    assert clean('a&amp;b') == 'a&b'
-    assert clean('<b>a&amp;b</b>') == '<b>a&amp;b</b>'
-    assert clean('<b>a&amp;b</b>', force_xml=True) == '<b>a&b</b>'
-
-def b32_hex(s):
-    s = s.strip().split()[0].lower()
-    if s.startswith("sha1:"):
-        s = s[5:]
-    if len(s) != 32:
-        return s
-    return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
-
-def is_cjk(s):
-    if not s:
-        return False
-    for c in s:
-        if c.isalpha():
-            lang_prefix = unicodedata.name(c).split()[0]
-            return lang_prefix in ('CJK', 'HIRAGANA', 'KATAKANA', 'HANGUL')
-    return False
-
-def test_is_cjk():
-    assert is_cjk(None) is False
-    assert is_cjk('') is False
-    assert is_cjk('blah') is False
-    assert is_cjk('岡, 鹿, 梨, 阜, 埼') is True
-    assert is_cjk('[岡, 鹿, 梨, 阜, 埼]') is True
-    assert is_cjk('菊') is True
-    assert is_cjk('岡, 鹿, 梨, 阜, 埼 with eng after') is True
-    assert is_cjk('水道') is True
-    assert is_cjk('オウ, イク') is True # kanji
-    assert is_cjk('ひヒ') is True
-    assert is_cjk('き゚ゅ') is True
-    assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') is True
-
 DOMAIN_REL_MAP = {
     "archive.org": "archive",
     # LOCKSS, Portico, DuraSpace, etc would also be "archive"
@@ -444,6 +292,7 @@ class EntityImporter:
         raise NotImplementedError
 
     def is_orcid(self, orcid):
+        # TODO: replace with clean_orcid() from fatcat_tools.normal
        return self._orcid_regex.match(orcid) is not None

    def lookup_orcid(self, orcid):
@@ -464,6 +313,7 @@ class EntityImporter:
        return creator_id

    def is_doi(self, doi):
+        # TODO: replace with clean_doi() from fatcat_tools.normal
        return doi.startswith("10.") and doi.count("/") >= 1

    def lookup_doi(self, doi):
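Note: the re-import near the top of common.py is what keeps this refactor from breaking existing importers; the helpers move to fatcat_tools.normal, but the old import path still resolves. A minimal sketch of the downstream usage this preserves (the example values are illustrative, not from the patch):

```python
# Old-style imports in importer code keep working, because common.py
# re-exports the helpers that moved to fatcat_tools.normal.
from fatcat_tools.importers.common import clean, is_cjk, b32_hex, LANG_MAP_MARC

assert clean("  Some Title  ") == "Some Title"  # whitespace stripped
assert is_cjk("水道") is True
assert LANG_MAP_MARC["eng"] == "en"
```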
- """ - if not thing: - return None - fix_entities = 'auto' - if force_xml: - fix_entities = True - fixed = ftfy.fix_text(thing, fix_entities=fix_entities).strip() - if not fixed or len(fixed) <= 1: - # wasn't zero-length before, but is now; return None - return None - return fixed - -def test_clean(): - - assert clean(None) == None - assert clean('') == None - assert clean('1') == None - assert clean('123') == '123' - assert clean('a&b') == 'a&b' - assert clean('<b>a&b</b>') == '<b>a&b</b>' - assert clean('<b>a&b</b>', force_xml=True) == '<b>a&b</b>' - -def b32_hex(s): - s = s.strip().split()[0].lower() - if s.startswith("sha1:"): - s = s[5:] - if len(s) != 32: - return s - return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8') - -def is_cjk(s): - if not s: - return False - for c in s: - if c.isalpha(): - lang_prefix = unicodedata.name(c).split()[0] - return lang_prefix in ('CJK', 'HIRAGANA', 'KATAKANA', 'HANGUL') - return False - -def test_is_cjk(): - assert is_cjk(None) is False - assert is_cjk('') is False - assert is_cjk('blah') is False - assert is_cjk('岡, 鹿, 梨, 阜, 埼') is True - assert is_cjk('[岡, 鹿, 梨, 阜, 埼]') is True - assert is_cjk('菊') is True - assert is_cjk('岡, 鹿, 梨, 阜, 埼 with eng after') is True - assert is_cjk('水道') is True - assert is_cjk('オウ, イク') is True # kanji - assert is_cjk('ひヒ') is True - assert is_cjk('き゚ゅ') is True - assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') is True - DOMAIN_REL_MAP = { "archive.org": "archive", # LOCKSS, Portico, DuraSpace, etc would also be "archive" @@ -444,6 +292,7 @@ class EntityImporter: raise NotImplementedError def is_orcid(self, orcid): + # TODO: replace with clean_orcid() from fatcat_tools.normal return self._orcid_regex.match(orcid) is not None def lookup_orcid(self, orcid): @@ -464,6 +313,7 @@ class EntityImporter: return creator_id def is_doi(self, doi): + # TODO: replace with clean_doi() from fatcat_tools.normal return doi.startswith("10.") and doi.count("/") >= 1 def lookup_doi(self, doi): diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py index 10a90dba..39927651 100644 --- a/python/fatcat_tools/normal.py +++ b/python/fatcat_tools/normal.py @@ -5,6 +5,13 @@ free-form input, titles, etc. """ import re +import base64 +from typing import Optional +import unicodedata + +import ftfy +import langdetect +import pycountry DOI_REGEX = re.compile(r"^10.\d{3,6}/\S+$") @@ -233,3 +240,318 @@ def test_clean_orcid(): assert clean_orcid("0123-4567-3456-6789 ") == "0123-4567-3456-6789" assert clean_orcid("01234567-3456-6780") == None assert clean_orcid("0x23-4567-3456-6780") == None + + +def clean_str(thing: Optional[str], force_xml: bool = False) -> Optional[str]: + """ + This function is appropriate to be called on any random, non-markup string, + such as author names, titles, etc. + + It will try to clean up common unicode mangles, HTML characters, etc. + + This will detect XML/HTML and "do the right thing" (aka, not remove + entities like '&' if there are tags in the string), unless you pass the + 'force_xml' parameter, which might be appropriate for, eg, names and + titles, which generally should be projected down to plain text. + + Also strips extra whitespace. 
+ """ + if not thing: + return None + fix_entities = 'auto' + if force_xml: + fix_entities = True + fixed = ftfy.fix_text(thing, fix_entities=fix_entities).strip() + if not fixed or len(fixed) <= 1: + # wasn't zero-length before, but is now; return None + return None + return fixed + +def test_clean_str(): + + assert clean_str(None) == None + assert clean_str('') == None + assert clean_str('1') == None + assert clean_str('123') == '123' + assert clean_str('a&b') == 'a&b' + assert clean_str('<b>a&b</b>') == '<b>a&b</b>' + assert clean_str('<b>a&b</b>', force_xml=True) == '<b>a&b</b>' + +def b32_hex(s): + s = s.strip().split()[0].lower() + if s.startswith("sha1:"): + s = s[5:] + if len(s) != 32: + return s + return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8') + +def is_cjk(s): + if not s: + return False + for c in s: + if c.isalpha(): + lang_prefix = unicodedata.name(c).split()[0] + return lang_prefix in ('CJK', 'HIRAGANA', 'KATAKANA', 'HANGUL') + return False + +def test_is_cjk(): + assert is_cjk(None) is False + assert is_cjk('') is False + assert is_cjk('blah') is False + assert is_cjk('岡, 鹿, 梨, 阜, 埼') is True + assert is_cjk('[岡, 鹿, 梨, 阜, 埼]') is True + assert is_cjk('菊') is True + assert is_cjk('岡, 鹿, 梨, 阜, 埼 with eng after') is True + assert is_cjk('水道') is True + assert is_cjk('オウ, イク') is True # kanji + assert is_cjk('ひヒ') is True + assert is_cjk('き゚ゅ') is True + assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') is True + +MONTH_MAP = { + "jan": 1, "january": 1, + "feb": 2, "febuary": 2, + "mar": 3, "march": 3, + "apr": 4, "april": 4, + "may": 5, "may": 5, + "jun": 6, "june": 6, + "jul": 7, "july": 7, + "aug": 8, "august": 8, + "sep": 9, "september": 9, + "oct": 10, "october": 10, + "nov": 11, "nov": 11, + "dec": 12, "december": 12, +} + +def parse_month(raw: Optional[str]) -> Optional[int]: + """ + Parses a string into a month number (1 to 12) + """ + if not raw: + return None + raw = raw.strip().lower() + if raw.isdigit(): + raw_int = int(raw) + if raw_int >= 1 and raw_int <= 12: + return raw_int + else: + return None + if raw in MONTH_MAP: + return MONTH_MAP[raw] + return None + +def test_parse_month() -> None: + + assert parse_month(None) == None + assert parse_month("") == None + assert parse_month("0") == None + assert parse_month("10") == 10 + assert parse_month("jan") == 1 + assert parse_month("September") == 9 + +def detect_text_lang(raw: str) -> Optional[str]: + """ + Tries to determine language of, eg, an abstract. + + Returns an ISO 631 2-char language code, or None. + """ + if not raw: + return None + try: + lang = langdetect.detect(raw) + assert len(lang) == 2 + return lang + except (langdetect.lang_detect_exception.LangDetectException, TypeError): + return None + return None + +def test_detect_text_lang() -> None: + assert detect_text_lang("") == None + EN_SAMPLE = "this is a string of English text for testing" + assert detect_text_lang(EN_SAMPLE) == "en" + JA_SAMPLE = "モーラの種類は、以下に示すように111程度存在する。ただし、研究者により数え方が少しずつ異なる。" + assert detect_text_lang(JA_SAMPLE) == "ja" + +def parse_lang_name(raw: Optional[str]) -> Optional[str]: + """ + Parses a language name and returns a 2-char ISO 631 language code. 
+ """ + if not raw: + return None + try: + lang = pycountry.languages.lookup(raw) + if lang.alpha_3 in ("mul", "mis"): + return None + return lang.alpha_2.lower() + except LookupError: + #print(f" unknown language: '{raw}', file=sys.stderr) + return None + except AttributeError: + #print(f" partial language metadata: '{lang}', file=sys.stderr) + return None + return None + +def test_parse_lang_name() -> None: + + assert parse_lang_name(None) == None + assert parse_lang_name("") == None + assert parse_lang_name("asdf ") == None + assert parse_lang_name("english") == "en" + assert parse_lang_name("ENGLISH") == "en" + assert parse_lang_name("asdf blah") is None + assert parse_lang_name("en") == "en" + assert parse_lang_name("EN") == "en" + assert parse_lang_name("ENG") == "en" + assert parse_lang_name("English") == "en" + assert parse_lang_name("Portuguese") == "pt" + + +def parse_country_name(s: Optional[str]) -> Optional[str]: + """ + Parses a country name into a ISO country code (2-char). + + This version copied from the chocula repository. + """ + if not s or s in ("Unknown"): + return None + + s = s.strip() + if s.lower() in ("usa", "new york (state)", "washington (state)"): + return "us" + if s.lower() in ("russia (federation)", "russia"): + return "ru" + if s == "Québec (Province)": + s = "Canada" + if s == "China (Republic : 1949- )": + return "tw" + if s == "Brunei": + return "bn" + if s.startswith("Congo "): + s = "Congo" + if s.lower() == "iran": + return "ir" + if s.lower() == "bermuda islands": + return "bm" + if s.lower() == "burma": + s = "myanmar" + if s.lower() in ("korea (south)", "south korea"): + return "kr" + if s.lower() in ("england", "scotland", "wales"): + return "uk" + s = s.replace(" (Republic)", "").replace(" (Federation)", "") + + try: + country = pycountry.countries.lookup(s) + except LookupError: + country = None + + if country: + return country.alpha_2.lower() + try: + sub = pycountry.subdivisions.lookup(s) + except LookupError: + sub = None + + s = s.replace(" (State)", "").replace(" (Province)", "") + if sub: + return sub.country_code.lower() + + else: + # print(f"unknown country: {s}", file=sys.stderr) + return None + + +def test_parse_country_name(): + assert parse_country_name("") is None + assert parse_country_name("asdf blah") is None + assert parse_country_name("us") == "us" + assert parse_country_name("USA") == "us" + assert parse_country_name("United States of America") == "us" + assert parse_country_name("united States") == "us" + assert parse_country_name("Massachusetts") == "us" + assert parse_country_name("Russia") == "ru" + assert parse_country_name("Japan") == "jp" + +# These are very close, but maybe not exactly 1-to-1 with 639-2? Some mix of +# 2/T and 2/B? 
+
+def parse_lang_name(raw: Optional[str]) -> Optional[str]:
+    """
+    Parses a language name and returns a 2-char ISO 639-1 language code.
+    """
+    if not raw:
+        return None
+    try:
+        lang = pycountry.languages.lookup(raw)
+        if lang.alpha_3 in ("mul", "mis"):
+            return None
+        return lang.alpha_2.lower()
+    except LookupError:
+        # print(f"  unknown language: '{raw}'", file=sys.stderr)
+        return None
+    except AttributeError:
+        # some matched languages have no alpha_2 (ISO 639-1) code
+        # print(f"  partial language metadata: '{lang}'", file=sys.stderr)
+        return None
+
+def test_parse_lang_name() -> None:
+
+    assert parse_lang_name(None) == None
+    assert parse_lang_name("") == None
+    assert parse_lang_name("asdf ") == None
+    assert parse_lang_name("english") == "en"
+    assert parse_lang_name("ENGLISH") == "en"
+    assert parse_lang_name("asdf blah") is None
+    assert parse_lang_name("en") == "en"
+    assert parse_lang_name("EN") == "en"
+    assert parse_lang_name("ENG") == "en"
+    assert parse_lang_name("English") == "en"
+    assert parse_lang_name("Portuguese") == "pt"
+
+
+def parse_country_name(s: Optional[str]) -> Optional[str]:
+    """
+    Parses a country name into a 2-char ISO country code.
+
+    This version copied from the chocula repository.
+    """
+    if not s or s in ("Unknown",):
+        return None
+
+    s = s.strip()
+    if s.lower() in ("usa", "new york (state)", "washington (state)"):
+        return "us"
+    if s.lower() in ("russia (federation)", "russia"):
+        return "ru"
+    if s == "Québec (Province)":
+        s = "Canada"
+    if s == "China (Republic : 1949- )":
+        return "tw"
+    if s == "Brunei":
+        return "bn"
+    if s.startswith("Congo "):
+        s = "Congo"
+    if s.lower() == "iran":
+        return "ir"
+    if s.lower() == "bermuda islands":
+        return "bm"
+    if s.lower() == "burma":
+        s = "myanmar"
+    if s.lower() in ("korea (south)", "south korea"):
+        return "kr"
+    if s.lower() in ("england", "scotland", "wales"):
+        return "uk"
+    s = s.replace(" (Republic)", "").replace(" (Federation)", "")
+
+    try:
+        country = pycountry.countries.lookup(s)
+    except LookupError:
+        country = None
+
+    if country:
+        return country.alpha_2.lower()
+
+    # strip state/province qualifiers before trying subdivision lookup
+    s = s.replace(" (State)", "").replace(" (Province)", "")
+    try:
+        sub = pycountry.subdivisions.lookup(s)
+    except LookupError:
+        sub = None
+
+    if sub:
+        return sub.country_code.lower()
+    else:
+        # print(f"unknown country: {s}", file=sys.stderr)
+        return None
+
+
+def test_parse_country_name():
+    assert parse_country_name("") is None
+    assert parse_country_name("asdf blah") is None
+    assert parse_country_name("us") == "us"
+    assert parse_country_name("USA") == "us"
+    assert parse_country_name("United States of America") == "us"
+    assert parse_country_name("united States") == "us"
+    assert parse_country_name("Massachusetts") == "us"
+    assert parse_country_name("Russia") == "ru"
+    assert parse_country_name("Japan") == "jp"
+
+# These are very close, but maybe not exactly 1-to-1 with 639-2? Some mix of
+# 2/T and 2/B?
+# PubMed/MEDLINE and JSTOR use these MARC codes
+# https://www.loc.gov/marc/languages/language_name.html
+LANG_MAP_MARC = {
+    'afr': 'af',
+    'alb': 'sq',
+    'amh': 'am',
+    'ara': 'ar',
+    'arm': 'hy',
+    'aze': 'az',
+    'ben': 'bn',
+    'bos': 'bs',
+    'bul': 'bg',
+    'cat': 'ca',
+    'chi': 'zh',
+    'cze': 'cs',
+    'dan': 'da',
+    'dut': 'nl',
+    'eng': 'en',
+    'epo': 'eo',
+    'est': 'et',
+    'fin': 'fi',
+    'fre': 'fr',
+    'geo': 'ka',
+    'ger': 'de',
+    'gla': 'gd',
+    'gre': 'el',
+    'heb': 'he',
+    'hin': 'hi',
+    'hrv': 'hr',
+    'hun': 'hu',
+    'ice': 'is',
+    'ind': 'id',
+    'ita': 'it',
+    'jpn': 'ja',
+    'kin': 'rw',
+    'kor': 'ko',
+    'lat': 'la',
+    'lav': 'lv',
+    'lit': 'lt',
+    'mac': 'mk',
+    'mal': 'ml',
+    'mao': 'mi',
+    'may': 'ms',
+    'nor': 'no',
+    'per': 'fa',
+    'pol': 'pl',
+    'por': 'pt',
+    'pus': 'ps',
+    'rum': 'ro',
+    'rus': 'ru',
+    'san': 'sa',
+    'slo': 'sk',
+    'slv': 'sl',
+    'spa': 'es',
+    'srp': 'sr',
+    'swe': 'sv',
+    'tha': 'th',
+    'tur': 'tr',
+    'ukr': 'uk',
+    'urd': 'ur',
+    'vie': 'vi',
+    'wel': 'cy',
+
+# additions
+    'gle': 'ga', # "Irish" (Gaelic)
+    'jav': 'jv', # Javanese
+    'welsh': 'cy', # Welsh
+    'oci': 'oc', # Occitan
+
+# Don't have ISO 639-1 codes
+    'grc': 'el', # Ancient Greek; map to modern greek
+    'map': None, # Austronesian (collection)
+    'syr': None, # Syriac, Modern
+    'gem': None, # Old Saxon
+    'non': None, # Old Norse
+    'emg': None, # Eastern Meohang
+    'neg': None, # Negidal
+    'mul': None, # Multiple languages
+    'und': None, # Undetermined
+}
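A note on how these pieces compose: PubMed/MEDLINE and JSTOR records carry MARC codes (handled by LANG_MAP_MARC), while other sources carry free-form names (handled by parse_lang_name). A hypothetical combined helper, not part of this patch, might look like:

```python
from typing import Optional

from fatcat_tools.normal import LANG_MAP_MARC, parse_lang_name

def parse_lang_code_or_name(raw: Optional[str]) -> Optional[str]:
    """Hypothetical combined lookup: MARC code table first, then name lookup."""
    if not raw:
        return None
    key = raw.strip().lower()
    if key in LANG_MAP_MARC:
        # may be None for MARC codes with no ISO 639-1 equivalent (eg 'und')
        return LANG_MAP_MARC[key]
    return parse_lang_name(raw)

assert parse_lang_code_or_name("fre") == "fr"      # MARC bibliographic code
assert parse_lang_code_or_name("English") == "en"  # free-form language name
```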