-rw-r--r--	python/fatcat_tools/importers/common.py | 158
-rw-r--r--	python/fatcat_tools/normal.py | 322
2 files changed, 326 insertions, 154 deletions
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 14415683..3c810391 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -3,12 +3,9 @@ import re
 import sys
 import csv
 import json
-import ftfy
-import base64
 import sqlite3
 import datetime
 import subprocess
-import unicodedata
 from collections import Counter
 from confluent_kafka import Consumer, KafkaException
 import xml.etree.ElementTree as ET
@@ -18,162 +15,13 @@ from bs4 import BeautifulSoup
 import fatcat_openapi_client
 from fatcat_openapi_client.rest import ApiException
 
+# TODO: refactor to remove the need for this (re-imports for backwards compatibility)
+from fatcat_tools.normal import (clean_str as clean, is_cjk, b32_hex, LANG_MAP_MARC)  # noqa: F401
 
 DATE_FMT = "%Y-%m-%d"
 SANE_MAX_RELEASES = 200
 SANE_MAX_URLS = 100
 
-# These are very close, but maybe not exactly 1-to-1 with 639-2? Some mix of
-# 2/T and 2/B?
-# PubMed/MEDLINE and JSTOR use these MARC codes
-# https://www.loc.gov/marc/languages/language_name.html
-LANG_MAP_MARC = {
-    'afr': 'af',
-    'alb': 'sq',
-    'amh': 'am',
-    'ara': 'ar',
-    'arm': 'hy',
-    'aze': 'az',
-    'ben': 'bn',
-    'bos': 'bs',
-    'bul': 'bg',
-    'cat': 'ca',
-    'chi': 'zh',
-    'cze': 'cs',
-    'dan': 'da',
-    'dut': 'nl',
-    'eng': 'en',
-    'epo': 'eo',
-    'est': 'et',
-    'fin': 'fi',
-    'fre': 'fr',
-    'geo': 'ka',
-    'ger': 'de',
-    'gla': 'gd',
-    'gre': 'el',
-    'heb': 'he',
-    'hin': 'hi',
-    'hrv': 'hr',
-    'hun': 'hu',
-    'ice': 'is',
-    'ind': 'id',
-    'ita': 'it',
-    'jpn': 'ja',
-    'kin': 'rw',
-    'kor': 'ko',
-    'lat': 'la',
-    'lav': 'lv',
-    'lit': 'lt',
-    'mac': 'mk',
-    'mal': 'ml',
-    'mao': 'mi',
-    'may': 'ms',
-    'nor': 'no',
-    'per': 'fa',
-    'per': 'fa',
-    'pol': 'pl',
-    'por': 'pt',
-    'pus': 'ps',
-    'rum': 'ro',
-    'rus': 'ru',
-    'san': 'sa',
-    'slo': 'sk',
-    'slv': 'sl',
-    'spa': 'es',
-    'srp': 'sr',
-    'swe': 'sv',
-    'tha': 'th',
-    'tur': 'tr',
-    'ukr': 'uk',
-    'urd': 'ur',
-    'vie': 'vi',
-    'wel': 'cy',
-
-# additions
-    'gle': 'ga', # "Irish" (Gaelic)
-    'jav': 'jv', # Javanese
-    'welsh': 'cy', # Welsh
-    'oci': 'oc', # Occitan
-
-# Don't have ISO 639-1 codes
-    'grc': 'el', # Ancient Greek; map to modern greek
-    'map': None, # Austronesian (collection)
-    'syr': None, # Syriac, Modern
-    'gem': None, # Old Saxon
-    'non': None, # Old Norse
-    'emg': None, # Eastern Meohang
-    'neg': None, # Negidal
-    'mul': None, # Multiple languages
-    'und': None, # Undetermined
-}
-
-
-def clean(thing, force_xml=False):
-    """
-    This function is appropriate to be called on any random, non-markup string,
-    such as author names, titles, etc.
-
-    It will try to clean up common unicode mangles, HTML characters, etc.
-
-    This will detect XML/HTML and "do the right thing" (aka, not remove
-    entities like '&amp;' if there are tags in the string), unless you pass the
-    'force_xml' parameter, which might be appropriate for, eg, names and
-    titles, which generally should be projected down to plain text.
-
-    Also strips extra whitespace.
-    """
-    if not thing:
-        return None
-    fix_entities = 'auto'
-    if force_xml:
-        fix_entities = True
-    fixed = ftfy.fix_text(thing, fix_entities=fix_entities).strip()
-    if not fixed or len(fixed) <= 1:
-        # wasn't zero-length before, but is now; return None
-        return None
-    return fixed
-
-def test_clean():
-
-    assert clean(None) == None
-    assert clean('') == None
-    assert clean('1') == None
-    assert clean('123') == '123'
-    assert clean('a&amp;b') == 'a&b'
-    assert clean('<b>a&amp;b</b>') == '<b>a&amp;b</b>'
-    assert clean('<b>a&amp;b</b>', force_xml=True) == '<b>a&b</b>'
-
-def b32_hex(s):
-    s = s.strip().split()[0].lower()
-    if s.startswith("sha1:"):
-        s = s[5:]
-    if len(s) != 32:
-        return s
-    return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
-
-def is_cjk(s):
-    if not s:
-        return False
-    for c in s:
-        if c.isalpha():
-            lang_prefix = unicodedata.name(c).split()[0]
-            return lang_prefix in ('CJK', 'HIRAGANA', 'KATAKANA', 'HANGUL')
-    return False
-
-def test_is_cjk():
-    assert is_cjk(None) is False
-    assert is_cjk('') is False
-    assert is_cjk('blah') is False
-    assert is_cjk('岡, 鹿, 梨, 阜, 埼') is True
-    assert is_cjk('[岡, 鹿, 梨, 阜, 埼]') is True
-    assert is_cjk('菊') is True
-    assert is_cjk('岡, 鹿, 梨, 阜, 埼 with eng after') is True
-    assert is_cjk('水道') is True
-    assert is_cjk('オウ, イク') is True # kanji
-    assert is_cjk('ひヒ') is True
-    assert is_cjk('き゚ゅ') is True
-    assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') is True
-
 DOMAIN_REL_MAP = {
     "archive.org": "archive",
     # LOCKSS, Portico, DuraSpace, etc would also be "archive"
@@ -444,6 +292,7 @@ class EntityImporter:
         raise NotImplementedError
 
     def is_orcid(self, orcid):
+        # TODO: replace with clean_orcid() from fatcat_tools.normal
        return self._orcid_regex.match(orcid) is not None

    def lookup_orcid(self, orcid):
@@ -464,6 +313,7 @@ class EntityImporter:
        return creator_id

    def is_doi(self, doi):
+        # TODO: replace with clean_doi() from fatcat_tools.normal
        return doi.startswith("10.") and doi.count("/") >= 1

    def lookup_doi(self, doi):
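Note: the re-import near the top of common.py is what keeps this refactor from breaking existing importers; the helpers move to fatcat_tools.normal, but the old import path still resolves. A minimal sketch of the downstream usage this preserves (the example values are illustrative, not from the patch):

```python
# Old-style imports in importer code keep working, because common.py
# re-exports the helpers that moved to fatcat_tools.normal.
from fatcat_tools.importers.common import clean, is_cjk, b32_hex, LANG_MAP_MARC

assert clean("  Some Title  ") == "Some Title"  # whitespace stripped
assert is_cjk("水道") is True
assert LANG_MAP_MARC["eng"] == "en"
```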
- """ - if not thing: - return None - fix_entities = 'auto' - if force_xml: - fix_entities = True - fixed = ftfy.fix_text(thing, fix_entities=fix_entities).strip() - if not fixed or len(fixed) <= 1: - # wasn't zero-length before, but is now; return None - return None - return fixed - -def test_clean(): - - assert clean(None) == None - assert clean('') == None - assert clean('1') == None - assert clean('123') == '123' - assert clean('a&b') == 'a&b' - assert clean('<b>a&b</b>') == '<b>a&b</b>' - assert clean('<b>a&b</b>', force_xml=True) == '<b>a&b</b>' - -def b32_hex(s): - s = s.strip().split()[0].lower() - if s.startswith("sha1:"): - s = s[5:] - if len(s) != 32: - return s - return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8') - -def is_cjk(s): - if not s: - return False - for c in s: - if c.isalpha(): - lang_prefix = unicodedata.name(c).split()[0] - return lang_prefix in ('CJK', 'HIRAGANA', 'KATAKANA', 'HANGUL') - return False - -def test_is_cjk(): - assert is_cjk(None) is False - assert is_cjk('') is False - assert is_cjk('blah') is False - assert is_cjk('岡, 鹿, 梨, 阜, 埼') is True - assert is_cjk('[岡, 鹿, 梨, 阜, 埼]') is True - assert is_cjk('菊') is True - assert is_cjk('岡, 鹿, 梨, 阜, 埼 with eng after') is True - assert is_cjk('水道') is True - assert is_cjk('オウ, イク') is True # kanji - assert is_cjk('ひヒ') is True - assert is_cjk('き゚ゅ') is True - assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') is True - DOMAIN_REL_MAP = { "archive.org": "archive", # LOCKSS, Portico, DuraSpace, etc would also be "archive" @@ -444,6 +292,7 @@ class EntityImporter: raise NotImplementedError def is_orcid(self, orcid): + # TODO: replace with clean_orcid() from fatcat_tools.normal return self._orcid_regex.match(orcid) is not None def lookup_orcid(self, orcid): @@ -464,6 +313,7 @@ class EntityImporter: return creator_id def is_doi(self, doi): + # TODO: replace with clean_doi() from fatcat_tools.normal return doi.startswith("10.") and doi.count("/") >= 1 def lookup_doi(self, doi): diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py index 10a90dba..39927651 100644 --- a/python/fatcat_tools/normal.py +++ b/python/fatcat_tools/normal.py @@ -5,6 +5,13 @@ free-form input, titles, etc. """ import re +import base64 +from typing import Optional +import unicodedata + +import ftfy +import langdetect +import pycountry DOI_REGEX = re.compile(r"^10.\d{3,6}/\S+$") @@ -233,3 +240,318 @@ def test_clean_orcid(): assert clean_orcid("0123-4567-3456-6789 ") == "0123-4567-3456-6789" assert clean_orcid("01234567-3456-6780") == None assert clean_orcid("0x23-4567-3456-6780") == None + + +def clean_str(thing: Optional[str], force_xml: bool = False) -> Optional[str]: + """ + This function is appropriate to be called on any random, non-markup string, + such as author names, titles, etc. + + It will try to clean up common unicode mangles, HTML characters, etc. + + This will detect XML/HTML and "do the right thing" (aka, not remove + entities like '&' if there are tags in the string), unless you pass the + 'force_xml' parameter, which might be appropriate for, eg, names and + titles, which generally should be projected down to plain text. + + Also strips extra whitespace. 
+ """ + if not thing: + return None + fix_entities = 'auto' + if force_xml: + fix_entities = True + fixed = ftfy.fix_text(thing, fix_entities=fix_entities).strip() + if not fixed or len(fixed) <= 1: + # wasn't zero-length before, but is now; return None + return None + return fixed + +def test_clean_str(): + + assert clean_str(None) == None + assert clean_str('') == None + assert clean_str('1') == None + assert clean_str('123') == '123' + assert clean_str('a&b') == 'a&b' + assert clean_str('<b>a&b</b>') == '<b>a&b</b>' + assert clean_str('<b>a&b</b>', force_xml=True) == '<b>a&b</b>' + +def b32_hex(s): + s = s.strip().split()[0].lower() + if s.startswith("sha1:"): + s = s[5:] + if len(s) != 32: + return s + return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8') + +def is_cjk(s): + if not s: + return False + for c in s: + if c.isalpha(): + lang_prefix = unicodedata.name(c).split()[0] + return lang_prefix in ('CJK', 'HIRAGANA', 'KATAKANA', 'HANGUL') + return False + +def test_is_cjk(): + assert is_cjk(None) is False + assert is_cjk('') is False + assert is_cjk('blah') is False + assert is_cjk('岡, 鹿, 梨, 阜, 埼') is True + assert is_cjk('[岡, 鹿, 梨, 阜, 埼]') is True + assert is_cjk('菊') is True + assert is_cjk('岡, 鹿, 梨, 阜, 埼 with eng after') is True + assert is_cjk('水道') is True + assert is_cjk('オウ, イク') is True # kanji + assert is_cjk('ひヒ') is True + assert is_cjk('き゚ゅ') is True + assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') is True + +MONTH_MAP = { + "jan": 1, "january": 1, + "feb": 2, "febuary": 2, + "mar": 3, "march": 3, + "apr": 4, "april": 4, + "may": 5, "may": 5, + "jun": 6, "june": 6, + "jul": 7, "july": 7, + "aug": 8, "august": 8, + "sep": 9, "september": 9, + "oct": 10, "october": 10, + "nov": 11, "nov": 11, + "dec": 12, "december": 12, +} + +def parse_month(raw: Optional[str]) -> Optional[int]: + """ + Parses a string into a month number (1 to 12) + """ + if not raw: + return None + raw = raw.strip().lower() + if raw.isdigit(): + raw_int = int(raw) + if raw_int >= 1 and raw_int <= 12: + return raw_int + else: + return None + if raw in MONTH_MAP: + return MONTH_MAP[raw] + return None + +def test_parse_month() -> None: + + assert parse_month(None) == None + assert parse_month("") == None + assert parse_month("0") == None + assert parse_month("10") == 10 + assert parse_month("jan") == 1 + assert parse_month("September") == 9 + +def detect_text_lang(raw: str) -> Optional[str]: + """ + Tries to determine language of, eg, an abstract. + + Returns an ISO 631 2-char language code, or None. + """ + if not raw: + return None + try: + lang = langdetect.detect(raw) + assert len(lang) == 2 + return lang + except (langdetect.lang_detect_exception.LangDetectException, TypeError): + return None + return None + +def test_detect_text_lang() -> None: + assert detect_text_lang("") == None + EN_SAMPLE = "this is a string of English text for testing" + assert detect_text_lang(EN_SAMPLE) == "en" + JA_SAMPLE = "モーラの種類は、以下に示すように111程度存在する。ただし、研究者により数え方が少しずつ異なる。" + assert detect_text_lang(JA_SAMPLE) == "ja" + +def parse_lang_name(raw: Optional[str]) -> Optional[str]: + """ + Parses a language name and returns a 2-char ISO 631 language code. 
+ """ + if not raw: + return None + try: + lang = pycountry.languages.lookup(raw) + if lang.alpha_3 in ("mul", "mis"): + return None + return lang.alpha_2.lower() + except LookupError: + #print(f" unknown language: '{raw}', file=sys.stderr) + return None + except AttributeError: + #print(f" partial language metadata: '{lang}', file=sys.stderr) + return None + return None + +def test_parse_lang_name() -> None: + + assert parse_lang_name(None) == None + assert parse_lang_name("") == None + assert parse_lang_name("asdf ") == None + assert parse_lang_name("english") == "en" + assert parse_lang_name("ENGLISH") == "en" + assert parse_lang_name("asdf blah") is None + assert parse_lang_name("en") == "en" + assert parse_lang_name("EN") == "en" + assert parse_lang_name("ENG") == "en" + assert parse_lang_name("English") == "en" + assert parse_lang_name("Portuguese") == "pt" + + +def parse_country_name(s: Optional[str]) -> Optional[str]: + """ + Parses a country name into a ISO country code (2-char). + + This version copied from the chocula repository. + """ + if not s or s in ("Unknown"): + return None + + s = s.strip() + if s.lower() in ("usa", "new york (state)", "washington (state)"): + return "us" + if s.lower() in ("russia (federation)", "russia"): + return "ru" + if s == "Québec (Province)": + s = "Canada" + if s == "China (Republic : 1949- )": + return "tw" + if s == "Brunei": + return "bn" + if s.startswith("Congo "): + s = "Congo" + if s.lower() == "iran": + return "ir" + if s.lower() == "bermuda islands": + return "bm" + if s.lower() == "burma": + s = "myanmar" + if s.lower() in ("korea (south)", "south korea"): + return "kr" + if s.lower() in ("england", "scotland", "wales"): + return "uk" + s = s.replace(" (Republic)", "").replace(" (Federation)", "") + + try: + country = pycountry.countries.lookup(s) + except LookupError: + country = None + + if country: + return country.alpha_2.lower() + try: + sub = pycountry.subdivisions.lookup(s) + except LookupError: + sub = None + + s = s.replace(" (State)", "").replace(" (Province)", "") + if sub: + return sub.country_code.lower() + + else: + # print(f"unknown country: {s}", file=sys.stderr) + return None + + +def test_parse_country_name(): + assert parse_country_name("") is None + assert parse_country_name("asdf blah") is None + assert parse_country_name("us") == "us" + assert parse_country_name("USA") == "us" + assert parse_country_name("United States of America") == "us" + assert parse_country_name("united States") == "us" + assert parse_country_name("Massachusetts") == "us" + assert parse_country_name("Russia") == "ru" + assert parse_country_name("Japan") == "jp" + +# These are very close, but maybe not exactly 1-to-1 with 639-2? Some mix of +# 2/T and 2/B? 
+
+def parse_lang_name(raw: Optional[str]) -> Optional[str]:
+    """
+    Parses a language name and returns a 2-char ISO 639-1 language code.
+    """
+    if not raw:
+        return None
+    try:
+        lang = pycountry.languages.lookup(raw)
+        if lang.alpha_3 in ("mul", "mis"):
+            return None
+        return lang.alpha_2.lower()
+    except LookupError:
+        # print(f"  unknown language: '{raw}'", file=sys.stderr)
+        return None
+    except AttributeError:
+        # some matched languages have no alpha_2 (ISO 639-1) code
+        # print(f"  partial language metadata: '{lang}'", file=sys.stderr)
+        return None
+
+def test_parse_lang_name() -> None:
+
+    assert parse_lang_name(None) == None
+    assert parse_lang_name("") == None
+    assert parse_lang_name("asdf ") == None
+    assert parse_lang_name("english") == "en"
+    assert parse_lang_name("ENGLISH") == "en"
+    assert parse_lang_name("asdf blah") is None
+    assert parse_lang_name("en") == "en"
+    assert parse_lang_name("EN") == "en"
+    assert parse_lang_name("ENG") == "en"
+    assert parse_lang_name("English") == "en"
+    assert parse_lang_name("Portuguese") == "pt"
+
+
+def parse_country_name(s: Optional[str]) -> Optional[str]:
+    """
+    Parses a country name into a 2-char ISO country code.
+
+    This version copied from the chocula repository.
+    """
+    if not s or s in ("Unknown",):
+        return None
+
+    s = s.strip()
+    if s.lower() in ("usa", "new york (state)", "washington (state)"):
+        return "us"
+    if s.lower() in ("russia (federation)", "russia"):
+        return "ru"
+    if s == "Québec (Province)":
+        s = "Canada"
+    if s == "China (Republic : 1949- )":
+        return "tw"
+    if s == "Brunei":
+        return "bn"
+    if s.startswith("Congo "):
+        s = "Congo"
+    if s.lower() == "iran":
+        return "ir"
+    if s.lower() == "bermuda islands":
+        return "bm"
+    if s.lower() == "burma":
+        s = "myanmar"
+    if s.lower() in ("korea (south)", "south korea"):
+        return "kr"
+    if s.lower() in ("england", "scotland", "wales"):
+        return "uk"
+    s = s.replace(" (Republic)", "").replace(" (Federation)", "")
+
+    try:
+        country = pycountry.countries.lookup(s)
+    except LookupError:
+        country = None
+
+    if country:
+        return country.alpha_2.lower()
+
+    # strip state/province qualifiers before trying subdivision lookup
+    s = s.replace(" (State)", "").replace(" (Province)", "")
+    try:
+        sub = pycountry.subdivisions.lookup(s)
+    except LookupError:
+        sub = None
+
+    if sub:
+        return sub.country_code.lower()
+    else:
+        # print(f"unknown country: {s}", file=sys.stderr)
+        return None
+
+
+def test_parse_country_name():
+    assert parse_country_name("") is None
+    assert parse_country_name("asdf blah") is None
+    assert parse_country_name("us") == "us"
+    assert parse_country_name("USA") == "us"
+    assert parse_country_name("United States of America") == "us"
+    assert parse_country_name("united States") == "us"
+    assert parse_country_name("Massachusetts") == "us"
+    assert parse_country_name("Russia") == "ru"
+    assert parse_country_name("Japan") == "jp"
+
+# These are very close, but maybe not exactly 1-to-1 with 639-2? Some mix of
+# 2/T and 2/B?
+# PubMed/MEDLINE and JSTOR use these MARC codes
+# https://www.loc.gov/marc/languages/language_name.html
+LANG_MAP_MARC = {
+    'afr': 'af',
+    'alb': 'sq',
+    'amh': 'am',
+    'ara': 'ar',
+    'arm': 'hy',
+    'aze': 'az',
+    'ben': 'bn',
+    'bos': 'bs',
+    'bul': 'bg',
+    'cat': 'ca',
+    'chi': 'zh',
+    'cze': 'cs',
+    'dan': 'da',
+    'dut': 'nl',
+    'eng': 'en',
+    'epo': 'eo',
+    'est': 'et',
+    'fin': 'fi',
+    'fre': 'fr',
+    'geo': 'ka',
+    'ger': 'de',
+    'gla': 'gd',
+    'gre': 'el',
+    'heb': 'he',
+    'hin': 'hi',
+    'hrv': 'hr',
+    'hun': 'hu',
+    'ice': 'is',
+    'ind': 'id',
+    'ita': 'it',
+    'jpn': 'ja',
+    'kin': 'rw',
+    'kor': 'ko',
+    'lat': 'la',
+    'lav': 'lv',
+    'lit': 'lt',
+    'mac': 'mk',
+    'mal': 'ml',
+    'mao': 'mi',
+    'may': 'ms',
+    'nor': 'no',
+    'per': 'fa',
+    'pol': 'pl',
+    'por': 'pt',
+    'pus': 'ps',
+    'rum': 'ro',
+    'rus': 'ru',
+    'san': 'sa',
+    'slo': 'sk',
+    'slv': 'sl',
+    'spa': 'es',
+    'srp': 'sr',
+    'swe': 'sv',
+    'tha': 'th',
+    'tur': 'tr',
+    'ukr': 'uk',
+    'urd': 'ur',
+    'vie': 'vi',
+    'wel': 'cy',
+
+# additions
+    'gle': 'ga', # "Irish" (Gaelic)
+    'jav': 'jv', # Javanese
+    'welsh': 'cy', # Welsh
+    'oci': 'oc', # Occitan
+
+# Don't have ISO 639-1 codes
+    'grc': 'el', # Ancient Greek; map to modern greek
+    'map': None, # Austronesian (collection)
+    'syr': None, # Syriac, Modern
+    'gem': None, # Old Saxon
+    'non': None, # Old Norse
+    'emg': None, # Eastern Meohang
+    'neg': None, # Negidal
+    'mul': None, # Multiple languages
+    'und': None, # Undetermined
+}
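A note on how these pieces compose: PubMed/MEDLINE and JSTOR records carry MARC codes (handled by LANG_MAP_MARC), while other sources carry free-form names (handled by parse_lang_name). A hypothetical combined helper, not part of this patch, might look like:

```python
from typing import Optional

from fatcat_tools.normal import LANG_MAP_MARC, parse_lang_name

def parse_lang_code_or_name(raw: Optional[str]) -> Optional[str]:
    """Hypothetical combined lookup: MARC code table first, then name lookup."""
    if not raw:
        return None
    key = raw.strip().lower()
    if key in LANG_MAP_MARC:
        # may be None for MARC codes with no ISO 639-1 equivalent (eg 'und')
        return LANG_MAP_MARC[key]
    return parse_lang_name(raw)

assert parse_lang_code_or_name("fre") == "fr"      # MARC bibliographic code
assert parse_lang_code_or_name("English") == "en"  # free-form language name
```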