aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2020-11-17 17:47:50 -0800
committerBryan Newbold <bnewbold@robocracy.org>2020-11-19 14:55:15 -0800
commit90b336ec3fe2cf34b0cbbbf5717aa3883af8685e (patch)
treebbe4ef307698b3c6b1854e5195dfef2c4ad02c2e
parente6c92c88e7ce266934167f220a847a20f0f97872 (diff)
downloadfatcat-90b336ec3fe2cf34b0cbbbf5717aa3883af8685e.tar.gz
fatcat-90b336ec3fe2cf34b0cbbbf5717aa3883af8685e.zip
more python normalizers, and move from importer common
Moved several normalizer helpers out of fatcat_tools.importers.common to fatcat_tools.normal. Copied language name and country name parser helpers from chocula repository (built on existing pycountry helper library). Have not gone through and refactored other importers to point to these helpers yet; that should be a separate PR when this branch is merged. Current changes are backwards compatible via re-imports.
-rw-r--r--python/fatcat_tools/importers/common.py158
-rw-r--r--python/fatcat_tools/normal.py322
2 files changed, 326 insertions, 154 deletions
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 14415683..3c810391 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -3,12 +3,9 @@ import re
import sys
import csv
import json
-import ftfy
-import base64
import sqlite3
import datetime
import subprocess
-import unicodedata
from collections import Counter
from confluent_kafka import Consumer, KafkaException
import xml.etree.ElementTree as ET
@@ -18,162 +15,13 @@ from bs4 import BeautifulSoup
import fatcat_openapi_client
from fatcat_openapi_client.rest import ApiException
+# TODO: refactor so remove need for this (re-imports for backwards compatibility)
+from fatcat_tools.normal import (clean_str as clean, is_cjk, b32_hex, LANG_MAP_MARC) # noqa: F401
DATE_FMT = "%Y-%m-%d"
SANE_MAX_RELEASES = 200
SANE_MAX_URLS = 100
-# These are very close, but maybe not exactly 1-to-1 with 639-2? Some mix of
-# 2/T and 2/B?
-# PubMed/MEDLINE and JSTOR use these MARC codes
-# https://www.loc.gov/marc/languages/language_name.html
-LANG_MAP_MARC = {
- 'afr': 'af',
- 'alb': 'sq',
- 'amh': 'am',
- 'ara': 'ar',
- 'arm': 'hy',
- 'aze': 'az',
- 'ben': 'bn',
- 'bos': 'bs',
- 'bul': 'bg',
- 'cat': 'ca',
- 'chi': 'zh',
- 'cze': 'cs',
- 'dan': 'da',
- 'dut': 'nl',
- 'eng': 'en',
- 'epo': 'eo',
- 'est': 'et',
- 'fin': 'fi',
- 'fre': 'fr',
- 'geo': 'ka',
- 'ger': 'de',
- 'gla': 'gd',
- 'gre': 'el',
- 'heb': 'he',
- 'hin': 'hi',
- 'hrv': 'hr',
- 'hun': 'hu',
- 'ice': 'is',
- 'ind': 'id',
- 'ita': 'it',
- 'jpn': 'ja',
- 'kin': 'rw',
- 'kor': 'ko',
- 'lat': 'la',
- 'lav': 'lv',
- 'lit': 'lt',
- 'mac': 'mk',
- 'mal': 'ml',
- 'mao': 'mi',
- 'may': 'ms',
- 'nor': 'no',
- 'per': 'fa',
- 'per': 'fa',
- 'pol': 'pl',
- 'por': 'pt',
- 'pus': 'ps',
- 'rum': 'ro',
- 'rus': 'ru',
- 'san': 'sa',
- 'slo': 'sk',
- 'slv': 'sl',
- 'spa': 'es',
- 'srp': 'sr',
- 'swe': 'sv',
- 'tha': 'th',
- 'tur': 'tr',
- 'ukr': 'uk',
- 'urd': 'ur',
- 'vie': 'vi',
- 'wel': 'cy',
-
-# additions
- 'gle': 'ga', # "Irish" (Gaelic)
- 'jav': 'jv', # Javanese
- 'welsh': 'cy', # Welsh
- 'oci': 'oc', # Occitan
-
-# Don't have ISO 639-1 codes
- 'grc': 'el', # Ancient Greek; map to modern greek
- 'map': None, # Austronesian (collection)
- 'syr': None, # Syriac, Modern
- 'gem': None, # Old Saxon
- 'non': None, # Old Norse
- 'emg': None, # Eastern Meohang
- 'neg': None, # Negidal
- 'mul': None, # Multiple languages
- 'und': None, # Undetermined
-}
-
-
-def clean(thing, force_xml=False):
- """
- This function is appropriate to be called on any random, non-markup string,
- such as author names, titles, etc.
-
- It will try to clean up common unicode mangles, HTML characters, etc.
-
- This will detect XML/HTML and "do the right thing" (aka, not remove
- entities like '&amp' if there are tags in the string), unless you pass the
- 'force_xml' parameter, which might be appropriate for, eg, names and
- titles, which generally should be projected down to plain text.
-
- Also strips extra whitespace.
- """
- if not thing:
- return None
- fix_entities = 'auto'
- if force_xml:
- fix_entities = True
- fixed = ftfy.fix_text(thing, fix_entities=fix_entities).strip()
- if not fixed or len(fixed) <= 1:
- # wasn't zero-length before, but is now; return None
- return None
- return fixed
-
-def test_clean():
-
- assert clean(None) == None
- assert clean('') == None
- assert clean('1') == None
- assert clean('123') == '123'
- assert clean('a&amp;b') == 'a&b'
- assert clean('<b>a&amp;b</b>') == '<b>a&amp;b</b>'
- assert clean('<b>a&amp;b</b>', force_xml=True) == '<b>a&b</b>'
-
-def b32_hex(s):
- s = s.strip().split()[0].lower()
- if s.startswith("sha1:"):
- s = s[5:]
- if len(s) != 32:
- return s
- return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
-
-def is_cjk(s):
- if not s:
- return False
- for c in s:
- if c.isalpha():
- lang_prefix = unicodedata.name(c).split()[0]
- return lang_prefix in ('CJK', 'HIRAGANA', 'KATAKANA', 'HANGUL')
- return False
-
-def test_is_cjk():
- assert is_cjk(None) is False
- assert is_cjk('') is False
- assert is_cjk('blah') is False
- assert is_cjk('岡, 鹿, 梨, 阜, 埼') is True
- assert is_cjk('[岡, 鹿, 梨, 阜, 埼]') is True
- assert is_cjk('菊') is True
- assert is_cjk('岡, 鹿, 梨, 阜, 埼 with eng after') is True
- assert is_cjk('水道') is True
- assert is_cjk('オウ, イク') is True # kanji
- assert is_cjk('ひヒ') is True
- assert is_cjk('き゚ゅ') is True
- assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') is True
-
DOMAIN_REL_MAP = {
"archive.org": "archive",
# LOCKSS, Portico, DuraSpace, etc would also be "archive"
@@ -444,6 +292,7 @@ class EntityImporter:
raise NotImplementedError
def is_orcid(self, orcid):
+ # TODO: replace with clean_orcid() from fatcat_tools.normal
return self._orcid_regex.match(orcid) is not None
def lookup_orcid(self, orcid):
@@ -464,6 +313,7 @@ class EntityImporter:
return creator_id
def is_doi(self, doi):
+ # TODO: replace with clean_doi() from fatcat_tools.normal
return doi.startswith("10.") and doi.count("/") >= 1
def lookup_doi(self, doi):
diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py
index 10a90dba..39927651 100644
--- a/python/fatcat_tools/normal.py
+++ b/python/fatcat_tools/normal.py
@@ -5,6 +5,13 @@ free-form input, titles, etc.
"""
import re
+import base64
+from typing import Optional
+import unicodedata
+
+import ftfy
+import langdetect
+import pycountry
DOI_REGEX = re.compile(r"^10.\d{3,6}/\S+$")
@@ -233,3 +240,318 @@ def test_clean_orcid():
assert clean_orcid("0123-4567-3456-6789 ") == "0123-4567-3456-6789"
assert clean_orcid("01234567-3456-6780") == None
assert clean_orcid("0x23-4567-3456-6780") == None
+
+
+def clean_str(thing: Optional[str], force_xml: bool = False) -> Optional[str]:
+ """
+ This function is appropriate to be called on any random, non-markup string,
+ such as author names, titles, etc.
+
+ It will try to clean up common unicode mangles, HTML characters, etc.
+
+ This will detect XML/HTML and "do the right thing" (aka, not remove
+ entities like '&amp' if there are tags in the string), unless you pass the
+ 'force_xml' parameter, which might be appropriate for, eg, names and
+ titles, which generally should be projected down to plain text.
+
+ Also strips extra whitespace.
+ """
+ if not thing:
+ return None
+ fix_entities = 'auto'
+ if force_xml:
+ fix_entities = True
+ fixed = ftfy.fix_text(thing, fix_entities=fix_entities).strip()
+ if not fixed or len(fixed) <= 1:
+ # wasn't zero-length before, but is now; return None
+ return None
+ return fixed
+
+def test_clean_str():
+
+ assert clean_str(None) == None
+ assert clean_str('') == None
+ assert clean_str('1') == None
+ assert clean_str('123') == '123'
+ assert clean_str('a&amp;b') == 'a&b'
+ assert clean_str('<b>a&amp;b</b>') == '<b>a&amp;b</b>'
+ assert clean_str('<b>a&amp;b</b>', force_xml=True) == '<b>a&b</b>'
+
+def b32_hex(s):
+ s = s.strip().split()[0].lower()
+ if s.startswith("sha1:"):
+ s = s[5:]
+ if len(s) != 32:
+ return s
+ return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
+
+def is_cjk(s):
+ if not s:
+ return False
+ for c in s:
+ if c.isalpha():
+ lang_prefix = unicodedata.name(c).split()[0]
+ return lang_prefix in ('CJK', 'HIRAGANA', 'KATAKANA', 'HANGUL')
+ return False
+
+def test_is_cjk():
+ assert is_cjk(None) is False
+ assert is_cjk('') is False
+ assert is_cjk('blah') is False
+ assert is_cjk('岡, 鹿, 梨, 阜, 埼') is True
+ assert is_cjk('[岡, 鹿, 梨, 阜, 埼]') is True
+ assert is_cjk('菊') is True
+ assert is_cjk('岡, 鹿, 梨, 阜, 埼 with eng after') is True
+ assert is_cjk('水道') is True
+    assert is_cjk('オウ, イク') is True # katakana
+ assert is_cjk('ひヒ') is True
+ assert is_cjk('き゚ゅ') is True
+ assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') is True
+
+MONTH_MAP = {
+    "jan": 1, "january": 1,
+    "feb": 2, "february": 2, "febuary": 2,
+    "mar": 3, "march": 3,
+    "apr": 4, "april": 4,
+    "may": 5,
+    "jun": 6, "june": 6,
+    "jul": 7, "july": 7,
+    "aug": 8, "august": 8,
+    "sep": 9, "september": 9,
+    "oct": 10, "october": 10,
+    "nov": 11, "november": 11,
+    "dec": 12, "december": 12,
+}
+
+def parse_month(raw: Optional[str]) -> Optional[int]:
+ """
+ Parses a string into a month number (1 to 12)
+ """
+ if not raw:
+ return None
+ raw = raw.strip().lower()
+ if raw.isdigit():
+ raw_int = int(raw)
+ if raw_int >= 1 and raw_int <= 12:
+ return raw_int
+ else:
+ return None
+ if raw in MONTH_MAP:
+ return MONTH_MAP[raw]
+ return None
+
+def test_parse_month() -> None:
+
+ assert parse_month(None) == None
+ assert parse_month("") == None
+ assert parse_month("0") == None
+ assert parse_month("10") == 10
+ assert parse_month("jan") == 1
+ assert parse_month("September") == 9
+
+def detect_text_lang(raw: str) -> Optional[str]:
+ """
+ Tries to determine language of, eg, an abstract.
+
+    Returns an ISO 639-1 2-char language code, or None.
+ """
+ if not raw:
+ return None
+ try:
+ lang = langdetect.detect(raw)
+ assert len(lang) == 2
+ return lang
+ except (langdetect.lang_detect_exception.LangDetectException, TypeError):
+ return None
+ return None
+
+def test_detect_text_lang() -> None:
+ assert detect_text_lang("") == None
+ EN_SAMPLE = "this is a string of English text for testing"
+ assert detect_text_lang(EN_SAMPLE) == "en"
+ JA_SAMPLE = "モーラの種類は、以下に示すように111程度存在する。ただし、研究者により数え方が少しずつ異なる。"
+ assert detect_text_lang(JA_SAMPLE) == "ja"
+
+def parse_lang_name(raw: Optional[str]) -> Optional[str]:
+ """
+    Parses a language name and returns a 2-char ISO 639-1 language code.
+ """
+ if not raw:
+ return None
+ try:
+ lang = pycountry.languages.lookup(raw)
+ if lang.alpha_3 in ("mul", "mis"):
+ return None
+ return lang.alpha_2.lower()
+ except LookupError:
+        #print(f"unknown language: '{raw}'", file=sys.stderr)
+ return None
+ except AttributeError:
+        #print(f"partial language metadata: '{lang}'", file=sys.stderr)
+ return None
+ return None
+
+def test_parse_lang_name() -> None:
+
+ assert parse_lang_name(None) == None
+ assert parse_lang_name("") == None
+ assert parse_lang_name("asdf ") == None
+ assert parse_lang_name("english") == "en"
+ assert parse_lang_name("ENGLISH") == "en"
+ assert parse_lang_name("asdf blah") is None
+ assert parse_lang_name("en") == "en"
+ assert parse_lang_name("EN") == "en"
+ assert parse_lang_name("ENG") == "en"
+ assert parse_lang_name("English") == "en"
+ assert parse_lang_name("Portuguese") == "pt"
+
+
+def parse_country_name(s: Optional[str]) -> Optional[str]:
+ """
+ Parses a country name into a ISO country code (2-char).
+
+ This version copied from the chocula repository.
+ """
+    if not s or s == "Unknown":
+ return None
+
+ s = s.strip()
+ if s.lower() in ("usa", "new york (state)", "washington (state)"):
+ return "us"
+ if s.lower() in ("russia (federation)", "russia"):
+ return "ru"
+ if s == "Québec (Province)":
+ s = "Canada"
+ if s == "China (Republic : 1949- )":
+ return "tw"
+ if s == "Brunei":
+ return "bn"
+ if s.startswith("Congo "):
+ s = "Congo"
+ if s.lower() == "iran":
+ return "ir"
+ if s.lower() == "bermuda islands":
+ return "bm"
+ if s.lower() == "burma":
+ s = "myanmar"
+ if s.lower() in ("korea (south)", "south korea"):
+ return "kr"
+ if s.lower() in ("england", "scotland", "wales"):
+ return "uk"
+ s = s.replace(" (Republic)", "").replace(" (Federation)", "")
+
+ try:
+ country = pycountry.countries.lookup(s)
+ except LookupError:
+ country = None
+
+ if country:
+ return country.alpha_2.lower()
+ try:
+ sub = pycountry.subdivisions.lookup(s)
+ except LookupError:
+ sub = None
+
+ s = s.replace(" (State)", "").replace(" (Province)", "")
+ if sub:
+ return sub.country_code.lower()
+
+ else:
+ # print(f"unknown country: {s}", file=sys.stderr)
+ return None
+
+
+def test_parse_country_name():
+ assert parse_country_name("") is None
+ assert parse_country_name("asdf blah") is None
+ assert parse_country_name("us") == "us"
+ assert parse_country_name("USA") == "us"
+ assert parse_country_name("United States of America") == "us"
+ assert parse_country_name("united States") == "us"
+ assert parse_country_name("Massachusetts") == "us"
+ assert parse_country_name("Russia") == "ru"
+ assert parse_country_name("Japan") == "jp"
+
+# These are very close, but maybe not exactly 1-to-1 with 639-2? Some mix of
+# 2/T and 2/B?
+# PubMed/MEDLINE and JSTOR use these MARC codes
+# https://www.loc.gov/marc/languages/language_name.html
+LANG_MAP_MARC = {
+ 'afr': 'af',
+ 'alb': 'sq',
+ 'amh': 'am',
+ 'ara': 'ar',
+ 'arm': 'hy',
+ 'aze': 'az',
+ 'ben': 'bn',
+ 'bos': 'bs',
+ 'bul': 'bg',
+ 'cat': 'ca',
+ 'chi': 'zh',
+ 'cze': 'cs',
+ 'dan': 'da',
+ 'dut': 'nl',
+ 'eng': 'en',
+ 'epo': 'eo',
+ 'est': 'et',
+ 'fin': 'fi',
+ 'fre': 'fr',
+ 'geo': 'ka',
+ 'ger': 'de',
+ 'gla': 'gd',
+ 'gre': 'el',
+ 'heb': 'he',
+ 'hin': 'hi',
+ 'hrv': 'hr',
+ 'hun': 'hu',
+ 'ice': 'is',
+ 'ind': 'id',
+ 'ita': 'it',
+ 'jpn': 'ja',
+ 'kin': 'rw',
+ 'kor': 'ko',
+ 'lat': 'la',
+ 'lav': 'lv',
+ 'lit': 'lt',
+ 'mac': 'mk',
+ 'mal': 'ml',
+ 'mao': 'mi',
+ 'may': 'ms',
+ 'nor': 'no',
+    'per': 'fa',
+ 'pol': 'pl',
+ 'por': 'pt',
+ 'pus': 'ps',
+ 'rum': 'ro',
+ 'rus': 'ru',
+ 'san': 'sa',
+ 'slo': 'sk',
+ 'slv': 'sl',
+ 'spa': 'es',
+ 'srp': 'sr',
+ 'swe': 'sv',
+ 'tha': 'th',
+ 'tur': 'tr',
+ 'ukr': 'uk',
+ 'urd': 'ur',
+ 'vie': 'vi',
+ 'wel': 'cy',
+
+# additions
+ 'gle': 'ga', # "Irish" (Gaelic)
+ 'jav': 'jv', # Javanese
+ 'welsh': 'cy', # Welsh
+ 'oci': 'oc', # Occitan
+
+# Don't have ISO 639-1 codes
+ 'grc': 'el', # Ancient Greek; map to modern greek
+ 'map': None, # Austronesian (collection)
+ 'syr': None, # Syriac, Modern
+ 'gem': None, # Old Saxon
+ 'non': None, # Old Norse
+ 'emg': None, # Eastern Meohang
+ 'neg': None, # Negidal
+ 'mul': None, # Multiple languages
+ 'und': None, # Undetermined
+}