Diffstat (limited to 'python/fatcat_tools/normal.py')
-rw-r--r-- | python/fatcat_tools/normal.py | 301 |
1 file changed, 179 insertions, 122 deletions
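Every hunk below is quote normalization, blank-line spacing, dict-literal explosion via a magic trailing comma, or wrapping of over-long asserts, with no behavior change; this is the output pattern of an automated formatter such as black. As a minimal sketch (not taken from this repo's tooling), the same quote normalization can be reproduced through black's Python API; the line_length=96 value is an assumption inferred from which long asserts got wrapped, not something this page confirms:

    # Hypothetical reproduction of the formatting pass; assumes `pip install black`.
    # line_length=96 is an inference from the wrapped lines below, not confirmed.
    import black

    src = "if raw.startswith('10.'):\n    return None\n"
    out = black.format_str(src, mode=black.Mode(line_length=96))
    print(out)  # single-quoted literal rewritten to "10.", matching the diff below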
diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py
index 9b65e768..12c58829 100644
--- a/python/fatcat_tools/normal.py
+++ b/python/fatcat_tools/normal.py
@@ -1,4 +1,3 @@
-
 """
 A bunch of helpers to parse and normalize strings: external identifiers,
 free-form input, titles, etc.
@@ -32,7 +31,7 @@ def clean_doi(raw: str) -> Optional[str]:
     if not raw:
         return None
     raw = raw.strip().lower()
-    if '\u2013' in raw:
+    if "\u2013" in raw:
         # Do not attempt to normalize "en dash" and since FC does not allow
         # unicode in DOI, treat this as invalid.
         return None
@@ -54,7 +53,7 @@ def clean_doi(raw: str) -> Optional[str]:
     # fatcatd uses same REGEX, but Rust regex rejects these characters, while
     # python doesn't. DOIs are syntaxtually valid, but very likely to be typos;
     # for now filter them out.
-    for c in ('¬', ):
+    for c in ("¬",):
         if c in raw:
             return None
@@ -70,6 +69,7 @@ def clean_doi(raw: str) -> Optional[str]:
         return None
     return raw
 
+
 def test_clean_doi():
     assert clean_doi("10.1234/asdf ") == "10.1234/asdf"
     assert clean_doi("10.1037//0002-9432.72.1.50") == "10.1037/0002-9432.72.1.50"
@@ -81,7 +81,9 @@ def test_clean_doi():
     assert clean_doi("doi:10.1234/asdf ") == "10.1234/asdf"
     assert clean_doi("doi:10.1234/ asdf ") is None
     assert clean_doi("10.4149/gpb¬_2017042") is None  # "logical negation" character
-    assert clean_doi("10.6002/ect.2020.häyry") is None  # this example via pubmed (pmid:32519616)
+    assert (
+        clean_doi("10.6002/ect.2020.häyry") is None
+    )  # this example via pubmed (pmid:32519616)
     assert clean_doi("10.30466/vrf.2019.98547.2350\u200e") is None
     assert clean_doi("10.12016/j.issn.2096⁃1456.2017.06.014") is None
     assert clean_doi("10.4025/diálogos.v17i2.36030") is None
@@ -92,6 +94,7 @@ def test_clean_doi():
 
 ARXIV_ID_REGEX = re.compile(r"^(\d{4}.\d{4,5}|[a-z\-]+(\.[A-Z]{2})?/\d{7})(v\d+)?$")
 
+
 def clean_arxiv_id(raw: str) -> Optional[str]:
     """
     Removes any:
@@ -113,6 +116,7 @@ def clean_arxiv_id(raw: str) -> Optional[str]:
         return None
     return raw
 
+
 def test_clean_arxiv_id():
     assert clean_arxiv_id("0806.2878v1") == "0806.2878v1"
     assert clean_arxiv_id("0806.2878") == "0806.2878"
@@ -141,16 +145,18 @@ def test_clean_arxiv_id():
     assert clean_arxiv_id("0806.v1") is None
     assert clean_arxiv_id("08062878v1") is None
 
+
 def clean_wikidata_qid(raw):
     if not raw:
         return None
     raw = raw.strip()
     if len(raw.split()) != 1 or len(raw) < 2:
         return None
-    if raw[0] == 'Q' and raw[1] != '0' and raw[1:].isdigit():
+    if raw[0] == "Q" and raw[1] != "0" and raw[1:].isdigit():
         return raw
     return None
 
+
 def test_clean_wikidata_qid():
     assert clean_wikidata_qid("Q1234") == "Q1234"
     assert clean_wikidata_qid("Q1") == "Q1"
@@ -163,6 +169,7 @@ def test_clean_wikidata_qid():
     assert clean_wikidata_qid("qfba3") is None
     assert clean_wikidata_qid("") is None
 
+
 def clean_pmid(raw: str) -> Optional[str]:
     if not raw:
         return None
@@ -173,6 +180,7 @@ def clean_pmid(raw: str) -> Optional[str]:
         return raw
     return None
 
+
 def test_clean_pmid():
     assert clean_pmid("1234") == "1234"
     assert clean_pmid("1234 ") == "1234"
@@ -180,6 +188,7 @@ def test_clean_pmid():
     assert clean_pmid("qfba3") is None
     assert clean_pmid("") is None
 
+
 def clean_pmcid(raw: str) -> Optional[str]:
     if not raw:
         return None
@@ -190,6 +199,7 @@ def clean_pmcid(raw: str) -> Optional[str]:
         return raw
     return None
 
+
 def clean_sha1(raw: str) -> Optional[str]:
     if not raw:
         return None
@@ -203,13 +213,21 @@ def clean_sha1(raw: str) -> Optional[str]:
         return None
     return raw
 
+
 def test_clean_sha1():
clean_sha1("0fba3fba0e1937aa0297de3836b768b5dfb23d7b") == "0fba3fba0e1937aa0297de3836b768b5dfb23d7b" - assert clean_sha1("0fba3fba0e1937aa0297de3836b768b5dfb23d7b ") == "0fba3fba0e1937aa0297de3836b768b5dfb23d7b" + assert ( + clean_sha1("0fba3fba0e1937aa0297de3836b768b5dfb23d7b") + == "0fba3fba0e1937aa0297de3836b768b5dfb23d7b" + ) + assert ( + clean_sha1("0fba3fba0e1937aa0297de3836b768b5dfb23d7b ") + == "0fba3fba0e1937aa0297de3836b768b5dfb23d7b" + ) assert clean_sha1("fba3fba0e1937aa0297de3836b768b5dfb23d7b") is None assert clean_sha1("qfba3fba0e1937aa0297de3836b768b5dfb23d7b") is None assert clean_sha1("0fba3fb a0e1937aa0297de3836b768b5dfb23d7b") is None + def clean_sha256(raw: str) -> Optional[str]: raw = raw.strip().lower() if len(raw.split()) != 1: @@ -221,12 +239,18 @@ def clean_sha256(raw: str) -> Optional[str]: return None return raw + def test_clean_sha256(): - assert clean_sha256("6cc853f2ae75696b2e45f476c76b946b0fc2df7c52bb38287cb074aceb77bc7f") == "6cc853f2ae75696b2e45f476c76b946b0fc2df7c52bb38287cb074aceb77bc7f" + assert ( + clean_sha256("6cc853f2ae75696b2e45f476c76b946b0fc2df7c52bb38287cb074aceb77bc7f") + == "6cc853f2ae75696b2e45f476c76b946b0fc2df7c52bb38287cb074aceb77bc7f" + ) assert clean_sha256("0fba3fba0e1937aa0297de3836b768b5dfb23d7b") is None + ISSN_REGEX = re.compile(r"^\d{4}-\d{3}[0-9X]$") + def clean_issn(raw: str) -> Optional[str]: if not raw: return None @@ -237,14 +261,17 @@ def clean_issn(raw: str) -> Optional[str]: return None return raw + def test_clean_issn(): assert clean_issn("1234-4567") == "1234-4567" assert clean_issn("1234-456X") == "1234-456X" assert clean_issn("134-4567") is None assert clean_issn("123X-4567") is None + ISBN13_REGEX = re.compile(r"^97(?:8|9)-\d{1,5}-\d{1,7}-\d{1,6}-\d$") + def clean_isbn13(raw: str) -> Optional[str]: if not raw: return None @@ -253,14 +280,17 @@ def clean_isbn13(raw: str) -> Optional[str]: return None return raw + def test_clean_isbn13(): assert clean_isbn13("978-1-56619-909-4") == "978-1-56619-909-4" assert clean_isbn13("978-1-4028-9462-6") == "978-1-4028-9462-6" assert clean_isbn13("978-1-56619-909-4 ") == "978-1-56619-909-4" assert clean_isbn13("9781566199094") is None + ORCID_REGEX = re.compile(r"^\d{4}-\d{4}-\d{4}-\d{3}[\dX]$") + def clean_orcid(raw: str) -> Optional[str]: if not raw: return None @@ -269,6 +299,7 @@ def clean_orcid(raw: str) -> Optional[str]: return None return raw + def test_clean_orcid(): assert clean_orcid("0123-4567-3456-6789") == "0123-4567-3456-6789" assert clean_orcid("0123-4567-3456-678X") == "0123-4567-3456-678X" @@ -279,6 +310,7 @@ def test_clean_orcid(): HDL_REGEX = re.compile(r"^\d+(\.\d+)*/\S+$") + def clean_hdl(raw): if not raw: return None @@ -293,14 +325,17 @@ def clean_hdl(raw): raw = raw[15:] if not HDL_REGEX.fullmatch(raw): return None - if raw.startswith('10.'): + if raw.startswith("10."): return None return raw + def test_clean_hdl(): assert clean_hdl("20.500.23456/ABC/DUMMY") == "20.500.23456/abc/dummy" assert clean_hdl("hdl:20.500.23456/ABC/DUMMY") == "20.500.23456/abc/dummy" - assert clean_hdl("https://hdl.handle.net/20.500.23456/ABC/DUMMY") == "20.500.23456/abc/dummy" + assert ( + clean_hdl("https://hdl.handle.net/20.500.23456/ABC/DUMMY") == "20.500.23456/abc/dummy" + ) assert clean_hdl("http://hdl.handle.net/20.500.23456/ABC/DUMMY") == "20.500.23456/abc/dummy" assert clean_hdl("21.1234/aksjdfh") == "21.1234/aksjdfh" assert clean_hdl("2381/12775") == "2381/12775" @@ -326,7 +361,7 @@ def clean_str(thing: Optional[str], force_xml: bool = False) -> Optional[str]: """ if not 
     if not thing:
         return None
-    unescape_html: Union[str, bool] = 'auto'
+    unescape_html: Union[str, bool] = "auto"
     if force_xml:
         unescape_html = True
     fixed = ftfy.fix_text(thing, unescape_html=unescape_html).strip()
@@ -335,15 +370,17 @@ def clean_str(thing: Optional[str], force_xml: bool = False) -> Optional[str]:
         return None
     return fixed
 
+
 def test_clean_str():
 
     assert clean_str(None) is None
-    assert clean_str('') is None
-    assert clean_str('1') is None
-    assert clean_str('123') == '123'
-    assert clean_str('a&b') == 'a&b'
-    assert clean_str('<b>a&b</b>') == '<b>a&b</b>'
-    assert clean_str('<b>a&b</b>', force_xml=True) == '<b>a&b</b>'
+    assert clean_str("") is None
+    assert clean_str("1") is None
+    assert clean_str("123") == "123"
+    assert clean_str("a&b") == "a&b"
+    assert clean_str("<b>a&b</b>") == "<b>a&b</b>"
+    assert clean_str("<b>a&b</b>", force_xml=True) == "<b>a&b</b>"
 
+
 def b32_hex(s):
     s = s.strip().split()[0].lower()
@@ -351,7 +388,8 @@ def clean_str(thing: Optional[str], force_xml: bool = False) -> Optional[str]:
         s = s[5:]
     if len(s) != 32:
         return s
-    return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
+    return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8")
 
+
 def is_cjk(s):
     if not s:
@@ -359,38 +397,53 @@ def is_cjk(s):
     for c in s:
         if c.isalpha():
             lang_prefix = unicodedata.name(c).split()[0]
-            return lang_prefix in ('CJK', 'HIRAGANA', 'KATAKANA', 'HANGUL')
+            return lang_prefix in ("CJK", "HIRAGANA", "KATAKANA", "HANGUL")
     return False
 
+
 def test_is_cjk():
     assert is_cjk(None) is False
-    assert is_cjk('') is False
-    assert is_cjk('blah') is False
-    assert is_cjk('岡, 鹿, 梨, 阜, 埼') is True
-    assert is_cjk('[岡, 鹿, 梨, 阜, 埼]') is True
-    assert is_cjk('菊') is True
-    assert is_cjk('岡, 鹿, 梨, 阜, 埼 with eng after') is True
-    assert is_cjk('水道') is True
-    assert is_cjk('オウ, イク') is True # kanji
-    assert is_cjk('ひヒ') is True
-    assert is_cjk('き゚ゅ') is True
-    assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') is True
+    assert is_cjk("") is False
+    assert is_cjk("blah") is False
+    assert is_cjk("岡, 鹿, 梨, 阜, 埼") is True
+    assert is_cjk("[岡, 鹿, 梨, 阜, 埼]") is True
+    assert is_cjk("菊") is True
+    assert is_cjk("岡, 鹿, 梨, 阜, 埼 with eng after") is True
+    assert is_cjk("水道") is True
+    assert is_cjk("オウ, イク") is True  # kanji
+    assert is_cjk("ひヒ") is True
+    assert is_cjk("き゚ゅ") is True
+    assert is_cjk("ㄴ, ㄹ, ㅁ, ㅂ, ㅅ") is True
 
+
 MONTH_MAP = {
-    "jan": 1, "january": 1,
-    "feb": 2, "febuary": 2,
-    "mar": 3, "march": 3,
-    "apr": 4, "april": 4,
-    "may": 5, "may": 5,
-    "jun": 6, "june": 6,
-    "jul": 7, "july": 7,
-    "aug": 8, "august": 8,
-    "sep": 9, "september": 9,
-    "oct": 10, "october": 10,
-    "nov": 11, "nov": 11,
-    "dec": 12, "december": 12,
+    "jan": 1,
+    "january": 1,
+    "feb": 2,
+    "febuary": 2,
+    "mar": 3,
+    "march": 3,
+    "apr": 4,
+    "april": 4,
+    "may": 5,
+    "may": 5,
+    "jun": 6,
+    "june": 6,
+    "jul": 7,
+    "july": 7,
+    "aug": 8,
+    "august": 8,
+    "sep": 9,
+    "september": 9,
+    "oct": 10,
+    "october": 10,
+    "nov": 11,
+    "nov": 11,
+    "dec": 12,
+    "december": 12,
 }
 
+
 def parse_month(raw: Optional[str]) -> Optional[int]:
     """
     Parses a string into a month number (1 to 12)
@@ -408,6 +461,7 @@ def parse_month(raw: Optional[str]) -> Optional[int]:
         return MONTH_MAP[raw]
     return None
 
+
 def test_parse_month() -> None:
 
     assert parse_month(None) is None
@@ -417,6 +471,7 @@ def test_parse_month() -> None:
     assert parse_month("jan") == 1
     assert parse_month("September") == 9
 
+
 def detect_text_lang(raw: str) -> Optional[str]:
     """
     Tries to determine language of, eg, an abstract.
@@ -427,13 +482,14 @@ def detect_text_lang(raw: str) -> Optional[str]:
         return None
     try:
         lang = langdetect.detect(raw)
-        lang = lang.split('-')[0]
+        lang = lang.split("-")[0]
         assert len(lang) == 2
         return lang
     except (langdetect.lang_detect_exception.LangDetectException, TypeError):
         return None
     return None
 
+
 def test_detect_text_lang() -> None:
     assert detect_text_lang("") is None
     EN_SAMPLE = "this is a string of English text for testing"
@@ -444,6 +500,7 @@ def test_detect_text_lang() -> None:
     # XXX: why does this detect as `ko` sometimes?
     assert detect_text_lang(ZH_SAMPLE) in ("zh", "ko")
 
+
 def parse_lang_name(raw: Optional[str]) -> Optional[str]:
     """
     Parses a language name and returns a 2-char ISO 631 language code.
@@ -456,13 +513,14 @@ def parse_lang_name(raw: Optional[str]) -> Optional[str]:
             return None
         return lang.alpha_2.lower()
     except LookupError:
-        #print(f" unknown language: '{raw}', file=sys.stderr)
+        # print(f" unknown language: '{raw}', file=sys.stderr)
         return None
     except AttributeError:
-        #print(f" partial language metadata: '{lang}', file=sys.stderr)
+        # print(f" partial language metadata: '{lang}', file=sys.stderr)
         return None
     return None
 
+
 def test_parse_lang_name() -> None:
 
     assert parse_lang_name(None) is None
@@ -544,86 +602,85 @@ def test_parse_country_name():
     assert parse_country_name("Russia") == "ru"
     assert parse_country_name("Japan") == "jp"
 
+
 # These are very close, but maybe not exactly 1-to-1 with 639-2? Some mix of
 # 2/T and 2/B?
 # PubMed/MEDLINE and JSTOR use these MARC codes
 # https://www.loc.gov/marc/languages/language_name.html
 LANG_MAP_MARC = {
-    'afr': 'af',
-    'alb': 'sq',
-    'amh': 'am',
-    'ara': 'ar',
-    'arm': 'hy',
-    'aze': 'az',
-    'ben': 'bn',
-    'bos': 'bs',
-    'bul': 'bg',
-    'cat': 'ca',
-    'chi': 'zh',
-    'cze': 'cs',
-    'dan': 'da',
-    'dut': 'nl',
-    'eng': 'en',
-    'epo': 'eo',
-    'est': 'et',
-    'fin': 'fi',
-    'fre': 'fr',
-    'geo': 'ka',
-    'ger': 'de',
-    'gla': 'gd',
-    'gre': 'el',
-    'heb': 'he',
-    'hin': 'hi',
-    'hrv': 'hr',
-    'hun': 'hu',
-    'ice': 'is',
-    'ind': 'id',
-    'ita': 'it',
-    'jpn': 'ja',
-    'kin': 'rw',
-    'kor': 'ko',
-    'lat': 'la',
-    'lav': 'lv',
-    'lit': 'lt',
-    'mac': 'mk',
-    'mal': 'ml',
-    'mao': 'mi',
-    'may': 'ms',
-    'nor': 'no',
-    'per': 'fa',
-    'per': 'fa',
-    'pol': 'pl',
-    'por': 'pt',
-    'pus': 'ps',
-    'rum': 'ro',
-    'rus': 'ru',
-    'san': 'sa',
-    'slo': 'sk',
-    'slv': 'sl',
-    'spa': 'es',
-    'srp': 'sr',
-    'swe': 'sv',
-    'tha': 'th',
-    'tur': 'tr',
-    'ukr': 'uk',
-    'urd': 'ur',
-    'vie': 'vi',
-    'wel': 'cy',
-
-# additions
-    'gle': 'ga', # "Irish" (Gaelic)
-    'jav': 'jv', # Javanese
-    'welsh': 'cy', # Welsh
-    'oci': 'oc', # Occitan
-
-# Don't have ISO 639-1 codes
-    'grc': 'el', # Ancient Greek; map to modern greek
-    'map': None, # Austronesian (collection)
-    'syr': None, # Syriac, Modern
-    'gem': None, # Old Saxon
-    'non': None, # Old Norse
-    'emg': None, # Eastern Meohang
-    'neg': None, # Negidal
-    'mul': None, # Multiple languages
-    'und': None, # Undetermined
+    "afr": "af",
+    "alb": "sq",
+    "amh": "am",
+    "ara": "ar",
+    "arm": "hy",
+    "aze": "az",
+    "ben": "bn",
+    "bos": "bs",
+    "bul": "bg",
+    "cat": "ca",
+    "chi": "zh",
+    "cze": "cs",
+    "dan": "da",
+    "dut": "nl",
+    "eng": "en",
+    "epo": "eo",
+    "est": "et",
+    "fin": "fi",
+    "fre": "fr",
+    "geo": "ka",
+    "ger": "de",
+    "gla": "gd",
+    "gre": "el",
+    "heb": "he",
+    "hin": "hi",
+    "hrv": "hr",
+    "hun": "hu",
+    "ice": "is",
+    "ind": "id",
+    "ita": "it",
+    "jpn": "ja",
+    "kin": "rw",
+    "kor": "ko",
+    "lat": "la",
+    "lav": "lv",
+    "lit": "lt",
+    "mac": "mk",
+    "mal": "ml",
+    "mao": "mi",
"may": "ms", + "nor": "no", + "per": "fa", + "per": "fa", + "pol": "pl", + "por": "pt", + "pus": "ps", + "rum": "ro", + "rus": "ru", + "san": "sa", + "slo": "sk", + "slv": "sl", + "spa": "es", + "srp": "sr", + "swe": "sv", + "tha": "th", + "tur": "tr", + "ukr": "uk", + "urd": "ur", + "vie": "vi", + "wel": "cy", + # additions + "gle": "ga", # "Irish" (Gaelic) + "jav": "jv", # Javanese + "welsh": "cy", # Welsh + "oci": "oc", # Occitan + # Don't have ISO 639-1 codes + "grc": "el", # Ancient Greek; map to modern greek + "map": None, # Austronesian (collection) + "syr": None, # Syriac, Modern + "gem": None, # Old Saxon + "non": None, # Old Norse + "emg": None, # Eastern Meohang + "neg": None, # Negidal + "mul": None, # Multiple languages + "und": None, # Undetermined } |