Diffstat (limited to 'python/fatcat_tools/normal.py')
-rw-r--r-- | python/fatcat_tools/normal.py | 301 |
1 file changed, 179 insertions, 122 deletions
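Every hunk below is quote normalization, blank-line spacing, dict-literal explosion via a magic trailing comma, or wrapping of over-long asserts, with no behavior change; this is the output pattern of an automated formatter such as black. As a minimal sketch (not taken from this repo's tooling), the same quote normalization can be reproduced through black's Python API; the line_length=96 value is an assumption inferred from which long asserts got wrapped, not something this page confirms:

    # Hypothetical reproduction of the formatting pass; assumes `pip install black`.
    # line_length=96 is an inference from the wrapped lines below, not confirmed.
    import black

    src = "if raw.startswith('10.'):\n    return None\n"
    out = black.format_str(src, mode=black.Mode(line_length=96))
    print(out)  # single-quoted literal rewritten to "10.", matching the diff below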
diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py
index 9b65e768..12c58829 100644
--- a/python/fatcat_tools/normal.py
+++ b/python/fatcat_tools/normal.py
@@ -1,4 +1,3 @@
-
 """
 A bunch of helpers to parse and normalize strings: external identifiers,
 free-form input, titles, etc.
@@ -32,7 +31,7 @@ def clean_doi(raw: str) -> Optional[str]:
     if not raw:
         return None
     raw = raw.strip().lower()
-    if '\u2013' in raw:
+    if "\u2013" in raw:
         # Do not attempt to normalize "en dash" and since FC does not allow
         # unicode in DOI, treat this as invalid.
         return None
@@ -54,7 +53,7 @@ def clean_doi(raw: str) -> Optional[str]:
     # fatcatd uses same REGEX, but Rust regex rejects these characters, while
     # python doesn't. DOIs are syntaxtually valid, but very likely to be typos;
     # for now filter them out.
-    for c in ('¬', ):
+    for c in ("¬",):
         if c in raw:
             return None
@@ -70,6 +69,7 @@ def clean_doi(raw: str) -> Optional[str]:
         return None
     return raw
 
+
 def test_clean_doi():
     assert clean_doi("10.1234/asdf ") == "10.1234/asdf"
     assert clean_doi("10.1037//0002-9432.72.1.50") == "10.1037/0002-9432.72.1.50"
@@ -81,7 +81,9 @@ def test_clean_doi():
     assert clean_doi("doi:10.1234/asdf ") == "10.1234/asdf"
     assert clean_doi("doi:10.1234/ asdf ") is None
     assert clean_doi("10.4149/gpb¬_2017042") is None  # "logical negation" character
-    assert clean_doi("10.6002/ect.2020.häyry") is None  # this example via pubmed (pmid:32519616)
+    assert (
+        clean_doi("10.6002/ect.2020.häyry") is None
+    )  # this example via pubmed (pmid:32519616)
     assert clean_doi("10.30466/vrf.2019.98547.2350\u200e") is None
     assert clean_doi("10.12016/j.issn.2096⁃1456.2017.06.014") is None
     assert clean_doi("10.4025/diálogos.v17i2.36030") is None
@@ -92,6 +94,7 @@ def test_clean_doi():
 
 ARXIV_ID_REGEX = re.compile(r"^(\d{4}.\d{4,5}|[a-z\-]+(\.[A-Z]{2})?/\d{7})(v\d+)?$")
 
+
 def clean_arxiv_id(raw: str) -> Optional[str]:
     """
     Removes any:
@@ -113,6 +116,7 @@ def clean_arxiv_id(raw: str) -> Optional[str]:
         return None
     return raw
 
+
 def test_clean_arxiv_id():
     assert clean_arxiv_id("0806.2878v1") == "0806.2878v1"
     assert clean_arxiv_id("0806.2878") == "0806.2878"
@@ -141,16 +145,18 @@ def test_clean_arxiv_id():
     assert clean_arxiv_id("0806.v1") is None
     assert clean_arxiv_id("08062878v1") is None
 
+
 def clean_wikidata_qid(raw):
     if not raw:
         return None
     raw = raw.strip()
     if len(raw.split()) != 1 or len(raw) < 2:
         return None
-    if raw[0] == 'Q' and raw[1] != '0' and raw[1:].isdigit():
+    if raw[0] == "Q" and raw[1] != "0" and raw[1:].isdigit():
         return raw
     return None
 
+
 def test_clean_wikidata_qid():
     assert clean_wikidata_qid("Q1234") == "Q1234"
     assert clean_wikidata_qid("Q1") == "Q1"
@@ -163,6 +169,7 @@ def test_clean_wikidata_qid():
     assert clean_wikidata_qid("qfba3") is None
     assert clean_wikidata_qid("") is None
 
+
 def clean_pmid(raw: str) -> Optional[str]:
     if not raw:
         return None
@@ -173,6 +180,7 @@ def clean_pmid(raw: str) -> Optional[str]:
         return raw
     return None
 
+
 def test_clean_pmid():
     assert clean_pmid("1234") == "1234"
     assert clean_pmid("1234 ") == "1234"
@@ -180,6 +188,7 @@ def test_clean_pmid():
     assert clean_pmid("qfba3") is None
     assert clean_pmid("") is None
 
+
 def clean_pmcid(raw: str) -> Optional[str]:
     if not raw:
         return None
@@ -190,6 +199,7 @@ def clean_pmcid(raw: str) -> Optional[str]:
         return raw
     return None
 
+
 def clean_sha1(raw: str) -> Optional[str]:
     if not raw:
         return None
@@ -203,13 +213,21 @@ def clean_sha1(raw: str) -> Optional[str]:
         return None
     return raw
 
+
 def test_clean_sha1():
clean_sha1("0fba3fba0e1937aa0297de3836b768b5dfb23d7b") == "0fba3fba0e1937aa0297de3836b768b5dfb23d7b" - assert clean_sha1("0fba3fba0e1937aa0297de3836b768b5dfb23d7b ") == "0fba3fba0e1937aa0297de3836b768b5dfb23d7b" + assert ( + clean_sha1("0fba3fba0e1937aa0297de3836b768b5dfb23d7b") + == "0fba3fba0e1937aa0297de3836b768b5dfb23d7b" + ) + assert ( + clean_sha1("0fba3fba0e1937aa0297de3836b768b5dfb23d7b ") + == "0fba3fba0e1937aa0297de3836b768b5dfb23d7b" + ) assert clean_sha1("fba3fba0e1937aa0297de3836b768b5dfb23d7b") is None assert clean_sha1("qfba3fba0e1937aa0297de3836b768b5dfb23d7b") is None assert clean_sha1("0fba3fb a0e1937aa0297de3836b768b5dfb23d7b") is None + def clean_sha256(raw: str) -> Optional[str]: raw = raw.strip().lower() if len(raw.split()) != 1: @@ -221,12 +239,18 @@ def clean_sha256(raw: str) -> Optional[str]: return None return raw + def test_clean_sha256(): - assert clean_sha256("6cc853f2ae75696b2e45f476c76b946b0fc2df7c52bb38287cb074aceb77bc7f") == "6cc853f2ae75696b2e45f476c76b946b0fc2df7c52bb38287cb074aceb77bc7f" + assert ( + clean_sha256("6cc853f2ae75696b2e45f476c76b946b0fc2df7c52bb38287cb074aceb77bc7f") + == "6cc853f2ae75696b2e45f476c76b946b0fc2df7c52bb38287cb074aceb77bc7f" + ) assert clean_sha256("0fba3fba0e1937aa0297de3836b768b5dfb23d7b") is None + ISSN_REGEX = re.compile(r"^\d{4}-\d{3}[0-9X]$") + def clean_issn(raw: str) -> Optional[str]: if not raw: return None @@ -237,14 +261,17 @@ def clean_issn(raw: str) -> Optional[str]: return None return raw + def test_clean_issn(): assert clean_issn("1234-4567") == "1234-4567" assert clean_issn("1234-456X") == "1234-456X" assert clean_issn("134-4567") is None assert clean_issn("123X-4567") is None + ISBN13_REGEX = re.compile(r"^97(?:8|9)-\d{1,5}-\d{1,7}-\d{1,6}-\d$") + def clean_isbn13(raw: str) -> Optional[str]: if not raw: return None @@ -253,14 +280,17 @@ def clean_isbn13(raw: str) -> Optional[str]: return None return raw + def test_clean_isbn13(): assert clean_isbn13("978-1-56619-909-4") == "978-1-56619-909-4" assert clean_isbn13("978-1-4028-9462-6") == "978-1-4028-9462-6" assert clean_isbn13("978-1-56619-909-4 ") == "978-1-56619-909-4" assert clean_isbn13("9781566199094") is None + ORCID_REGEX = re.compile(r"^\d{4}-\d{4}-\d{4}-\d{3}[\dX]$") + def clean_orcid(raw: str) -> Optional[str]: if not raw: return None @@ -269,6 +299,7 @@ def clean_orcid(raw: str) -> Optional[str]: return None return raw + def test_clean_orcid(): assert clean_orcid("0123-4567-3456-6789") == "0123-4567-3456-6789" assert clean_orcid("0123-4567-3456-678X") == "0123-4567-3456-678X" @@ -279,6 +310,7 @@ def test_clean_orcid(): HDL_REGEX = re.compile(r"^\d+(\.\d+)*/\S+$") + def clean_hdl(raw): if not raw: return None @@ -293,14 +325,17 @@ def clean_hdl(raw): raw = raw[15:] if not HDL_REGEX.fullmatch(raw): return None - if raw.startswith('10.'): + if raw.startswith("10."): return None return raw + def test_clean_hdl(): assert clean_hdl("20.500.23456/ABC/DUMMY") == "20.500.23456/abc/dummy" assert clean_hdl("hdl:20.500.23456/ABC/DUMMY") == "20.500.23456/abc/dummy" - assert clean_hdl("https://hdl.handle.net/20.500.23456/ABC/DUMMY") == "20.500.23456/abc/dummy" + assert ( + clean_hdl("https://hdl.handle.net/20.500.23456/ABC/DUMMY") == "20.500.23456/abc/dummy" + ) assert clean_hdl("http://hdl.handle.net/20.500.23456/ABC/DUMMY") == "20.500.23456/abc/dummy" assert clean_hdl("21.1234/aksjdfh") == "21.1234/aksjdfh" assert clean_hdl("2381/12775") == "2381/12775" @@ -326,7 +361,7 @@ def clean_str(thing: Optional[str], force_xml: bool = False) -> Optional[str]: """ if not 
     if not thing:
         return None
-    unescape_html: Union[str, bool] = 'auto'
+    unescape_html: Union[str, bool] = "auto"
     if force_xml:
         unescape_html = True
     fixed = ftfy.fix_text(thing, unescape_html=unescape_html).strip()
@@ -335,15 +370,17 @@ def clean_str(thing: Optional[str], force_xml: bool = False) -> Optional[str]:
         return None
     return fixed
 
+
 def test_clean_str():
 
     assert clean_str(None) is None
-    assert clean_str('') is None
-    assert clean_str('1') is None
-    assert clean_str('123') == '123'
-    assert clean_str('a&b') == 'a&b'
-    assert clean_str('<b>a&b</b>') == '<b>a&b</b>'
-    assert clean_str('<b>a&b</b>', force_xml=True) == '<b>a&b</b>'
+    assert clean_str("") is None
+    assert clean_str("1") is None
+    assert clean_str("123") == "123"
+    assert clean_str("a&b") == "a&b"
+    assert clean_str("<b>a&b</b>") == "<b>a&b</b>"
+    assert clean_str("<b>a&b</b>", force_xml=True) == "<b>a&b</b>"
 
+
 def b32_hex(s):
     s = s.strip().split()[0].lower()
@@ -351,7 +388,8 @@ def clean_str(thing: Optional[str], force_xml: bool = False) -> Optional[str]:
         s = s[5:]
     if len(s) != 32:
         return s
-    return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
+    return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8")
 
+
 def is_cjk(s):
     if not s:
@@ -359,38 +397,53 @@ def is_cjk(s):
     for c in s:
         if c.isalpha():
             lang_prefix = unicodedata.name(c).split()[0]
-            return lang_prefix in ('CJK', 'HIRAGANA', 'KATAKANA', 'HANGUL')
+            return lang_prefix in ("CJK", "HIRAGANA", "KATAKANA", "HANGUL")
     return False
 
+
 def test_is_cjk():
     assert is_cjk(None) is False
-    assert is_cjk('') is False
-    assert is_cjk('blah') is False
-    assert is_cjk('岡, 鹿, 梨, 阜, 埼') is True
-    assert is_cjk('[岡, 鹿, 梨, 阜, 埼]') is True
-    assert is_cjk('菊') is True
-    assert is_cjk('岡, 鹿, 梨, 阜, 埼 with eng after') is True
-    assert is_cjk('水道') is True
-    assert is_cjk('オウ, イク') is True # kanji
-    assert is_cjk('ひヒ') is True
-    assert is_cjk('き゚ゅ') is True
-    assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') is True
+    assert is_cjk("") is False
+    assert is_cjk("blah") is False
+    assert is_cjk("岡, 鹿, 梨, 阜, 埼") is True
+    assert is_cjk("[岡, 鹿, 梨, 阜, 埼]") is True
+    assert is_cjk("菊") is True
+    assert is_cjk("岡, 鹿, 梨, 阜, 埼 with eng after") is True
+    assert is_cjk("水道") is True
+    assert is_cjk("オウ, イク") is True  # kanji
+    assert is_cjk("ひヒ") is True
+    assert is_cjk("き゚ゅ") is True
+    assert is_cjk("ㄴ, ㄹ, ㅁ, ㅂ, ㅅ") is True
 
+
 MONTH_MAP = {
-    "jan": 1, "january": 1,
-    "feb": 2, "febuary": 2,
-    "mar": 3, "march": 3,
-    "apr": 4, "april": 4,
-    "may": 5, "may": 5,
-    "jun": 6, "june": 6,
-    "jul": 7, "july": 7,
-    "aug": 8, "august": 8,
-    "sep": 9, "september": 9,
-    "oct": 10, "october": 10,
-    "nov": 11, "nov": 11,
-    "dec": 12, "december": 12,
+    "jan": 1,
+    "january": 1,
+    "feb": 2,
+    "febuary": 2,
+    "mar": 3,
+    "march": 3,
+    "apr": 4,
+    "april": 4,
+    "may": 5,
+    "may": 5,
+    "jun": 6,
+    "june": 6,
+    "jul": 7,
+    "july": 7,
+    "aug": 8,
+    "august": 8,
+    "sep": 9,
+    "september": 9,
+    "oct": 10,
+    "october": 10,
+    "nov": 11,
+    "nov": 11,
+    "dec": 12,
+    "december": 12,
 }
 
+
 def parse_month(raw: Optional[str]) -> Optional[int]:
     """
     Parses a string into a month number (1 to 12)
@@ -408,6 +461,7 @@ def parse_month(raw: Optional[str]) -> Optional[int]:
         return MONTH_MAP[raw]
     return None
 
+
 def test_parse_month() -> None:
 
     assert parse_month(None) is None
@@ -417,6 +471,7 @@ def test_parse_month() -> None:
     assert parse_month("jan") == 1
     assert parse_month("September") == 9
 
+
 def detect_text_lang(raw: str) -> Optional[str]:
     """
     Tries to determine language of, eg, an abstract.
@@ -427,13 +482,14 @@ def detect_text_lang(raw: str) -> Optional[str]:
         return None
     try:
         lang = langdetect.detect(raw)
-        lang = lang.split('-')[0]
+        lang = lang.split("-")[0]
         assert len(lang) == 2
         return lang
     except (langdetect.lang_detect_exception.LangDetectException, TypeError):
         return None
     return None
 
+
 def test_detect_text_lang() -> None:
     assert detect_text_lang("") is None
     EN_SAMPLE = "this is a string of English text for testing"
@@ -444,6 +500,7 @@ def test_detect_text_lang() -> None:
     # XXX: why does this detect as `ko` sometimes?
     assert detect_text_lang(ZH_SAMPLE) in ("zh", "ko")
 
+
 def parse_lang_name(raw: Optional[str]) -> Optional[str]:
     """
     Parses a language name and returns a 2-char ISO 631 language code.
@@ -456,13 +513,14 @@ def parse_lang_name(raw: Optional[str]) -> Optional[str]:
             return None
         return lang.alpha_2.lower()
     except LookupError:
-        #print(f" unknown language: '{raw}', file=sys.stderr)
+        # print(f" unknown language: '{raw}', file=sys.stderr)
         return None
     except AttributeError:
-        #print(f" partial language metadata: '{lang}', file=sys.stderr)
+        # print(f" partial language metadata: '{lang}', file=sys.stderr)
         return None
     return None
 
+
 def test_parse_lang_name() -> None:
 
     assert parse_lang_name(None) is None
@@ -544,86 +602,85 @@ def test_parse_country_name():
     assert parse_country_name("Russia") == "ru"
     assert parse_country_name("Japan") == "jp"
 
+
 # These are very close, but maybe not exactly 1-to-1 with 639-2? Some mix of
 # 2/T and 2/B?
 # PubMed/MEDLINE and JSTOR use these MARC codes
 # https://www.loc.gov/marc/languages/language_name.html
 LANG_MAP_MARC = {
-    'afr': 'af',
-    'alb': 'sq',
-    'amh': 'am',
-    'ara': 'ar',
-    'arm': 'hy',
-    'aze': 'az',
-    'ben': 'bn',
-    'bos': 'bs',
-    'bul': 'bg',
-    'cat': 'ca',
-    'chi': 'zh',
-    'cze': 'cs',
-    'dan': 'da',
-    'dut': 'nl',
-    'eng': 'en',
-    'epo': 'eo',
-    'est': 'et',
-    'fin': 'fi',
-    'fre': 'fr',
-    'geo': 'ka',
-    'ger': 'de',
-    'gla': 'gd',
-    'gre': 'el',
-    'heb': 'he',
-    'hin': 'hi',
-    'hrv': 'hr',
-    'hun': 'hu',
-    'ice': 'is',
-    'ind': 'id',
-    'ita': 'it',
-    'jpn': 'ja',
-    'kin': 'rw',
-    'kor': 'ko',
-    'lat': 'la',
-    'lav': 'lv',
-    'lit': 'lt',
-    'mac': 'mk',
-    'mal': 'ml',
-    'mao': 'mi',
-    'may': 'ms',
-    'nor': 'no',
-    'per': 'fa',
-    'per': 'fa',
-    'pol': 'pl',
-    'por': 'pt',
-    'pus': 'ps',
-    'rum': 'ro',
-    'rus': 'ru',
-    'san': 'sa',
-    'slo': 'sk',
-    'slv': 'sl',
-    'spa': 'es',
-    'srp': 'sr',
-    'swe': 'sv',
-    'tha': 'th',
-    'tur': 'tr',
-    'ukr': 'uk',
-    'urd': 'ur',
-    'vie': 'vi',
-    'wel': 'cy',
-
-# additions
-    'gle': 'ga', # "Irish" (Gaelic)
-    'jav': 'jv', # Javanese
-    'welsh': 'cy', # Welsh
-    'oci': 'oc', # Occitan
-
-# Don't have ISO 639-1 codes
-    'grc': 'el', # Ancient Greek; map to modern greek
-    'map': None, # Austronesian (collection)
-    'syr': None, # Syriac, Modern
-    'gem': None, # Old Saxon
-    'non': None, # Old Norse
-    'emg': None, # Eastern Meohang
-    'neg': None, # Negidal
-    'mul': None, # Multiple languages
-    'und': None, # Undetermined
+    "afr": "af",
+    "alb": "sq",
+    "amh": "am",
+    "ara": "ar",
+    "arm": "hy",
+    "aze": "az",
+    "ben": "bn",
+    "bos": "bs",
+    "bul": "bg",
+    "cat": "ca",
+    "chi": "zh",
+    "cze": "cs",
+    "dan": "da",
+    "dut": "nl",
+    "eng": "en",
+    "epo": "eo",
+    "est": "et",
+    "fin": "fi",
+    "fre": "fr",
+    "geo": "ka",
+    "ger": "de",
+    "gla": "gd",
+    "gre": "el",
+    "heb": "he",
+    "hin": "hi",
+    "hrv": "hr",
+    "hun": "hu",
+    "ice": "is",
+    "ind": "id",
+    "ita": "it",
+    "jpn": "ja",
+    "kin": "rw",
+    "kor": "ko",
+    "lat": "la",
+    "lav": "lv",
+    "lit": "lt",
+    "mac": "mk",
+    "mal": "ml",
+    "mao": "mi",
"may": "ms", + "nor": "no", + "per": "fa", + "per": "fa", + "pol": "pl", + "por": "pt", + "pus": "ps", + "rum": "ro", + "rus": "ru", + "san": "sa", + "slo": "sk", + "slv": "sl", + "spa": "es", + "srp": "sr", + "swe": "sv", + "tha": "th", + "tur": "tr", + "ukr": "uk", + "urd": "ur", + "vie": "vi", + "wel": "cy", + # additions + "gle": "ga", # "Irish" (Gaelic) + "jav": "jv", # Javanese + "welsh": "cy", # Welsh + "oci": "oc", # Occitan + # Don't have ISO 639-1 codes + "grc": "el", # Ancient Greek; map to modern greek + "map": None, # Austronesian (collection) + "syr": None, # Syriac, Modern + "gem": None, # Old Saxon + "non": None, # Old Norse + "emg": None, # Eastern Meohang + "neg": None, # Negidal + "mul": None, # Multiple languages + "und": None, # Undetermined } |