aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--python/fatcat_tools/importers/common.py158
-rw-r--r--python/fatcat_tools/normal.py322
2 files changed, 326 insertions, 154 deletions
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 14415683..3c810391 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -3,12 +3,9 @@ import re
import sys
import csv
import json
-import ftfy
-import base64
import sqlite3
import datetime
import subprocess
-import unicodedata
from collections import Counter
from confluent_kafka import Consumer, KafkaException
import xml.etree.ElementTree as ET
@@ -18,162 +15,13 @@ from bs4 import BeautifulSoup
import fatcat_openapi_client
from fatcat_openapi_client.rest import ApiException
+# TODO: refactor callers to remove the need for these re-imports (kept for backwards compatibility)
+from fatcat_tools.normal import (clean_str as clean, is_cjk, b32_hex, LANG_MAP_MARC) # noqa: F401
DATE_FMT = "%Y-%m-%d"
SANE_MAX_RELEASES = 200
SANE_MAX_URLS = 100
-# These are very close, but maybe not exactly 1-to-1 with 639-2? Some mix of
-# 2/T and 2/B?
-# PubMed/MEDLINE and JSTOR use these MARC codes
-# https://www.loc.gov/marc/languages/language_name.html
-LANG_MAP_MARC = {
- 'afr': 'af',
- 'alb': 'sq',
- 'amh': 'am',
- 'ara': 'ar',
- 'arm': 'hy',
- 'aze': 'az',
- 'ben': 'bn',
- 'bos': 'bs',
- 'bul': 'bg',
- 'cat': 'ca',
- 'chi': 'zh',
- 'cze': 'cs',
- 'dan': 'da',
- 'dut': 'nl',
- 'eng': 'en',
- 'epo': 'eo',
- 'est': 'et',
- 'fin': 'fi',
- 'fre': 'fr',
- 'geo': 'ka',
- 'ger': 'de',
- 'gla': 'gd',
- 'gre': 'el',
- 'heb': 'he',
- 'hin': 'hi',
- 'hrv': 'hr',
- 'hun': 'hu',
- 'ice': 'is',
- 'ind': 'id',
- 'ita': 'it',
- 'jpn': 'ja',
- 'kin': 'rw',
- 'kor': 'ko',
- 'lat': 'la',
- 'lav': 'lv',
- 'lit': 'lt',
- 'mac': 'mk',
- 'mal': 'ml',
- 'mao': 'mi',
- 'may': 'ms',
- 'nor': 'no',
- 'per': 'fa',
- 'per': 'fa',
- 'pol': 'pl',
- 'por': 'pt',
- 'pus': 'ps',
- 'rum': 'ro',
- 'rus': 'ru',
- 'san': 'sa',
- 'slo': 'sk',
- 'slv': 'sl',
- 'spa': 'es',
- 'srp': 'sr',
- 'swe': 'sv',
- 'tha': 'th',
- 'tur': 'tr',
- 'ukr': 'uk',
- 'urd': 'ur',
- 'vie': 'vi',
- 'wel': 'cy',
-
-# additions
- 'gle': 'ga', # "Irish" (Gaelic)
- 'jav': 'jv', # Javanese
- 'welsh': 'cy', # Welsh
- 'oci': 'oc', # Occitan
-
-# Don't have ISO 639-1 codes
- 'grc': 'el', # Ancient Greek; map to modern greek
- 'map': None, # Austronesian (collection)
- 'syr': None, # Syriac, Modern
- 'gem': None, # Old Saxon
- 'non': None, # Old Norse
- 'emg': None, # Eastern Meohang
- 'neg': None, # Negidal
- 'mul': None, # Multiple languages
- 'und': None, # Undetermined
-}
-
-
-def clean(thing, force_xml=False):
- """
- This function is appropriate to be called on any random, non-markup string,
- such as author names, titles, etc.
-
- It will try to clean up common unicode mangles, HTML characters, etc.
-
- This will detect XML/HTML and "do the right thing" (aka, not remove
- entities like '&amp' if there are tags in the string), unless you pass the
- 'force_xml' parameter, which might be appropriate for, eg, names and
- titles, which generally should be projected down to plain text.
-
- Also strips extra whitespace.
- """
- if not thing:
- return None
- fix_entities = 'auto'
- if force_xml:
- fix_entities = True
- fixed = ftfy.fix_text(thing, fix_entities=fix_entities).strip()
- if not fixed or len(fixed) <= 1:
- # wasn't zero-length before, but is now; return None
- return None
- return fixed
-
-def test_clean():
-
- assert clean(None) == None
- assert clean('') == None
- assert clean('1') == None
- assert clean('123') == '123'
- assert clean('a&amp;b') == 'a&b'
- assert clean('<b>a&amp;b</b>') == '<b>a&amp;b</b>'
- assert clean('<b>a&amp;b</b>', force_xml=True) == '<b>a&b</b>'
-
-def b32_hex(s):
- s = s.strip().split()[0].lower()
- if s.startswith("sha1:"):
- s = s[5:]
- if len(s) != 32:
- return s
- return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
-
-def is_cjk(s):
- if not s:
- return False
- for c in s:
- if c.isalpha():
- lang_prefix = unicodedata.name(c).split()[0]
- return lang_prefix in ('CJK', 'HIRAGANA', 'KATAKANA', 'HANGUL')
- return False
-
-def test_is_cjk():
- assert is_cjk(None) is False
- assert is_cjk('') is False
- assert is_cjk('blah') is False
- assert is_cjk('岡, 鹿, 梨, 阜, 埼') is True
- assert is_cjk('[岡, 鹿, 梨, 阜, 埼]') is True
- assert is_cjk('菊') is True
- assert is_cjk('岡, 鹿, 梨, 阜, 埼 with eng after') is True
- assert is_cjk('水道') is True
- assert is_cjk('オウ, イク') is True # kanji
- assert is_cjk('ひヒ') is True
- assert is_cjk('き゚ゅ') is True
- assert is_cjk('ㄴ, ㄹ, ㅁ, ㅂ, ㅅ') is True
-
DOMAIN_REL_MAP = {
"archive.org": "archive",
# LOCKSS, Portico, DuraSpace, etc would also be "archive"
@@ -444,6 +292,7 @@ class EntityImporter:
raise NotImplementedError
def is_orcid(self, orcid):
+ # TODO: replace with clean_orcid() from fatcat_tools.normal
return self._orcid_regex.match(orcid) is not None
def lookup_orcid(self, orcid):
@@ -464,6 +313,7 @@ class EntityImporter:
return creator_id
def is_doi(self, doi):
+ # TODO: replace with clean_doi() from fatcat_tools.normal
return doi.startswith("10.") and doi.count("/") >= 1
def lookup_doi(self, doi):
diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py
index 10a90dba..39927651 100644
--- a/python/fatcat_tools/normal.py
+++ b/python/fatcat_tools/normal.py
@@ -5,6 +5,13 @@ free-form input, titles, etc.
"""
import re
+import base64
+from typing import Optional
+import unicodedata
+
+import ftfy
+import langdetect
+import pycountry
DOI_REGEX = re.compile(r"^10.\d{3,6}/\S+$")
@@ -233,3 +240,318 @@ def test_clean_orcid():
assert clean_orcid("0123-4567-3456-6789 ") == "0123-4567-3456-6789"
assert clean_orcid("01234567-3456-6780") == None
assert clean_orcid("0x23-4567-3456-6780") == None
+
+
+def clean_str(thing: Optional[str], force_xml: bool = False) -> Optional[str]:
+    """
+    Normalizes a free-form, non-markup string (author name, title, etc).
+
+    The text is run through ftfy to repair common unicode mangling and HTML
+    entities, then surrounding whitespace is stripped.
+
+    By default ftfy decides on its own whether to unescape entities (eg, it
+    leaves '&amp;' alone when the string contains tags). Pass 'force_xml' to
+    always unescape them, which is usually appropriate for names and titles
+    that should be projected down to plain text.
+
+    Returns None for empty input, and also when the cleaned-up result is one
+    character or shorter.
+    """
+    if not thing:
+        return None
+    entity_mode = True if force_xml else 'auto'
+    cleaned = ftfy.fix_text(thing, fix_entities=entity_mode).strip()
+    # zero or one remaining characters is treated as "nothing useful"
+    if len(cleaned) <= 1:
+        return None
+    return cleaned
+
+def test_clean_str() -> None:
+    """Checks empty/short input handling and entity unescaping in clean_str()."""
+
+    # empty and single-character inputs collapse to None
+    assert clean_str(None) is None
+    assert clean_str('') is None
+    assert clean_str('1') is None
+    assert clean_str('123') == '123'
+    # entities are unescaped in plain text, but left alone inside markup...
+    assert clean_str('a&amp;b') == 'a&b'
+    assert clean_str('<b>a&amp;b</b>') == '<b>a&amp;b</b>'
+    # ...unless force_xml is passed
+    assert clean_str('<b>a&amp;b</b>', force_xml=True) == '<b>a&b</b>'
+
+def b32_hex(s: str) -> str:
+    """
+    Converts a base32-encoded digest (eg, a 32-char base32 SHA-1) to lowercase hex.
+
+    Only the first whitespace-separated token of the input is considered, and
+    an optional "sha1:" prefix is dropped. A token that is not exactly 32
+    characters long is returned lowercased and otherwise untouched
+    (presumably it is already hex).
+
+    Empty or whitespace-only input returns an empty string.
+    """
+    tokens = s.strip().split()
+    if not tokens:
+        # nothing to convert; indexing tokens[0] would raise IndexError
+        return ""
+    s = tokens[0].lower()
+    if s.startswith("sha1:"):
+        s = s[5:]
+    if len(s) != 32:
+        return s
+    # b32decode() wants the upper-case alphabet; hex output is lower-cased
+    return base64.b16encode(base64.b32decode(s.upper())).lower().decode('utf-8')
+
+def is_cjk(s):
+    """
+    Reports whether the first alphabetic character of 's' is CJK.
+
+    "CJK" here means the first word of the character's Unicode name is one
+    of CJK, HIRAGANA, KATAKANA, or HANGUL. Non-alphabetic characters
+    (punctuation, digits, whitespace) are skipped, and only the first
+    alphabetic character is examined. Empty/None input, or input without any
+    alphabetic character, is False.
+    """
+    if not s:
+        return False
+    first_alpha = next((ch for ch in s if ch.isalpha()), None)
+    if first_alpha is None:
+        return False
+    script = unicodedata.name(first_alpha).split()[0]
+    return script in ('CJK', 'HIRAGANA', 'KATAKANA', 'HANGUL')
+
+def test_is_cjk():
+    """Exercises is_cjk() with empty, latin, and CJK-family samples."""
+    not_cjk = (None, '', 'blah')
+    cjk = (
+        '岡, 鹿, 梨, 阜, 埼',
+        '[岡, 鹿, 梨, 阜, 埼]',
+        '菊',
+        '岡, 鹿, 梨, 阜, 埼 with eng after',
+        '水道',
+        'オウ, イク',  # katakana
+        'ひヒ',
+        'き゚ゅ',
+        'ㄴ, ㄹ, ㅁ, ㅂ, ㅅ',
+    )
+    for sample in not_cjk:
+        assert is_cjk(sample) is False
+    for sample in cjk:
+        assert is_cjk(sample) is True
+
+# Lower-case English month names and three-letter abbreviations, mapped to
+# the month number.
+MONTH_MAP = {
+    "jan": 1, "january": 1,
+    # "febuary" is a misspelling; kept so inputs which relied on it still parse
+    "feb": 2, "february": 2, "febuary": 2,
+    "mar": 3, "march": 3,
+    "apr": 4, "april": 4,
+    "may": 5,
+    "jun": 6, "june": 6,
+    "jul": 7, "july": 7,
+    "aug": 8, "august": 8,
+    "sep": 9, "september": 9,
+    "oct": 10, "october": 10,
+    "nov": 11, "november": 11,
+    "dec": 12, "december": 12,
+}
+
+def parse_month(raw: Optional[str]) -> Optional[int]:
+    """
+    Parses a string into a month number (1 to 12)
+
+    Accepts decimal numbers ("3", "03") as well as the English names and
+    abbreviations in MONTH_MAP, ignoring case and surrounding whitespace.
+    Returns None for anything else, including out-of-range numbers.
+    """
+    if not raw:
+        return None
+    raw = raw.strip().lower()
+    # isdecimal(), not isdigit(): the latter also accepts characters such as
+    # "²", which int() then rejects with a ValueError
+    if raw.isdecimal():
+        month = int(raw)
+        return month if 1 <= month <= 12 else None
+    return MONTH_MAP.get(raw)
+
+def test_parse_month() -> None:
+    """Checks empty, numeric, and named-month inputs to parse_month()."""
+
+    # empty or out-of-range input yields None
+    assert parse_month(None) is None
+    assert parse_month("") is None
+    assert parse_month("0") is None
+    assert parse_month("10") == 10
+    # names are matched case-insensitively
+    assert parse_month("jan") == 1
+    assert parse_month("September") == 9
+
+def detect_text_lang(raw: str) -> Optional[str]:
+    """
+    Tries to determine language of, eg, an abstract.
+
+    Returns an ISO 639-1 2-char language code, or None if the input is empty,
+    detection fails, or the detected code cannot be reduced to 2 characters.
+    """
+    if not raw:
+        return None
+    try:
+        lang = langdetect.detect(raw)
+    except (langdetect.lang_detect_exception.LangDetectException, TypeError):
+        return None
+    # langdetect reports Chinese as "zh-cn" / "zh-tw"; keep only the primary
+    # subtag so those come back as "zh" instead of failing a length check
+    lang = lang.split("-")[0]
+    if len(lang) != 2:
+        return None
+    return lang
+
+def test_detect_text_lang() -> None:
+    """Checks empty input plus one English and one Japanese sample."""
+    assert detect_text_lang("") is None
+    EN_SAMPLE = "this is a string of English text for testing"
+    assert detect_text_lang(EN_SAMPLE) == "en"
+    JA_SAMPLE = "モーラの種類は、以下に示すように111程度存在する。ただし、研究者により数え方が少しずつ異なる。"
+    assert detect_text_lang(JA_SAMPLE) == "ja"
+
+def parse_lang_name(raw: Optional[str]) -> Optional[str]:
+    """
+    Resolves a language name or code to a 2-char ISO 639-1 language code.
+
+    The lookup itself is delegated to pycountry. Returns None when the input
+    is empty, is not recognized, resolves to one of the special "mul"/"mis"
+    codes, or the matched language record has no 2-char code.
+    """
+    if not raw:
+        return None
+    try:
+        match = pycountry.languages.lookup(raw)
+        if match.alpha_3 not in ("mul", "mis"):
+            # AttributeError here presumably means the record carries no
+            # 2-char code (partial language metadata)
+            return match.alpha_2.lower()
+    except (LookupError, AttributeError):
+        # unknown language, or partial language metadata
+        pass
+    return None
+
+def test_parse_lang_name() -> None:
+    """Checks rejects, names, and 2/3-letter codes in assorted letter case."""
+
+    # empty and unrecognized inputs
+    assert parse_lang_name(None) is None
+    assert parse_lang_name("") is None
+    assert parse_lang_name("asdf ") is None
+    assert parse_lang_name("english") == "en"
+    assert parse_lang_name("ENGLISH") == "en"
+    assert parse_lang_name("asdf blah") is None
+    # codes are accepted as well as names
+    assert parse_lang_name("en") == "en"
+    assert parse_lang_name("EN") == "en"
+    assert parse_lang_name("ENG") == "en"
+    assert parse_lang_name("English") == "en"
+    assert parse_lang_name("Portuguese") == "pt"
+
+
+def parse_country_name(s: Optional[str]) -> Optional[str]:
+    """
+    Parses a country name into a ISO country code (2-char).
+
+    This version copied from the chocula repository.
+
+    A handful of catalog-style spellings are special-cased first; anything
+    else is looked up with pycountry, first as a country and then as a
+    subdivision (state/province), in which case the code of the containing
+    country is returned. The result is lower-case. Returns None for empty
+    input, the literal "Unknown", or anything that cannot be resolved.
+    """
+    if not s:
+        return None
+    s = s.strip()
+    # Exact comparison: `s in ("Unknown")` is a substring test against the
+    # string "Unknown", which also rejected inputs like "no" or "kn".
+    if not s or s == "Unknown":
+        return None
+
+    if s.lower() in ("usa", "new york (state)", "washington (state)"):
+        return "us"
+    if s.lower() in ("russia (federation)", "russia"):
+        return "ru"
+    if s == "Québec (Province)":
+        s = "Canada"
+    if s == "China (Republic : 1949- )":
+        return "tw"
+    if s == "Brunei":
+        return "bn"
+    if s.startswith("Congo "):
+        s = "Congo"
+    if s.lower() == "iran":
+        return "ir"
+    if s.lower() == "bermuda islands":
+        return "bm"
+    if s.lower() == "burma":
+        s = "myanmar"
+    if s.lower() in ("korea (south)", "south korea"):
+        return "kr"
+    # NOTE(review): "uk" is not the ISO 3166-1 alpha-2 code for the United
+    # Kingdom (that is "gb"); left unchanged so existing output stays stable.
+    if s.lower() in ("england", "scotland", "wales"):
+        return "uk"
+    s = s.replace(" (Republic)", "").replace(" (Federation)", "")
+
+    try:
+        country = pycountry.countries.lookup(s)
+    except LookupError:
+        country = None
+
+    if country:
+        return country.alpha_2.lower()
+
+    # Not a country: try subdivisions. The qualifier is dropped *before* the
+    # lookup; doing it afterwards (as before) had no effect on the result.
+    s = s.replace(" (State)", "").replace(" (Province)", "")
+    try:
+        sub = pycountry.subdivisions.lookup(s)
+    except LookupError:
+        sub = None
+
+    if sub:
+        return sub.country_code.lower()
+
+    # print(f"unknown country: {s}", file=sys.stderr)
+    return None
+
+
+def test_parse_country_name():
+    """Spot-checks rejects, codes, full names, a subdivision, and special cases."""
+    for unparseable in ("", "asdf blah"):
+        assert parse_country_name(unparseable) is None
+    expected = [
+        ("us", "us"),
+        ("USA", "us"),
+        ("United States of America", "us"),
+        ("united States", "us"),
+        ("Massachusetts", "us"),
+        ("Russia", "ru"),
+        ("Japan", "jp"),
+    ]
+    for name, code in expected:
+        assert parse_country_name(name) == code
+
+# These are very close, but maybe not exactly 1-to-1 with 639-2? Some mix of
+# 2/T and 2/B?
+# PubMed/MEDLINE and JSTOR use these MARC codes
+# https://www.loc.gov/marc/languages/language_name.html
+# Maps 3-letter MARC language codes to 2-letter ISO 639-1 codes. A value of
+# None marks a code which is recognized but has no 2-letter equivalent.
+LANG_MAP_MARC = {
+    'afr': 'af',
+    'alb': 'sq',
+    'amh': 'am',
+    'ara': 'ar',
+    'arm': 'hy',
+    'aze': 'az',
+    'ben': 'bn',
+    'bos': 'bs',
+    'bul': 'bg',
+    'cat': 'ca',
+    'chi': 'zh',
+    'cze': 'cs',
+    'dan': 'da',
+    'dut': 'nl',
+    'eng': 'en',
+    'epo': 'eo',
+    'est': 'et',
+    'fin': 'fi',
+    'fre': 'fr',
+    'geo': 'ka',
+    'ger': 'de',
+    'gla': 'gd',
+    'gre': 'el',
+    'heb': 'he',
+    'hin': 'hi',
+    'hrv': 'hr',
+    'hun': 'hu',
+    'ice': 'is',
+    'ind': 'id',
+    'ita': 'it',
+    'jpn': 'ja',
+    'kin': 'rw',
+    'kor': 'ko',
+    'lat': 'la',
+    'lav': 'lv',
+    'lit': 'lt',
+    'mac': 'mk',
+    'mal': 'ml',
+    'mao': 'mi',
+    'may': 'ms',
+    'nor': 'no',
+    'per': 'fa',
+    'pol': 'pl',
+    'por': 'pt',
+    'pus': 'ps',
+    'rum': 'ro',
+    'rus': 'ru',
+    'san': 'sa',
+    'slo': 'sk',
+    'slv': 'sl',
+    'spa': 'es',
+    'srp': 'sr',
+    'swe': 'sv',
+    'tha': 'th',
+    'tur': 'tr',
+    'ukr': 'uk',
+    'urd': 'ur',
+    'vie': 'vi',
+    'wel': 'cy',
+
+    # additions
+    'gle': 'ga', # "Irish" (Gaelic)
+    'jav': 'jv', # Javanese
+    'welsh': 'cy', # Welsh
+    'oci': 'oc', # Occitan
+
+    # Don't have ISO 639-1 codes
+    'grc': 'el', # Ancient Greek; map to modern greek
+    'map': None, # Austronesian (collection)
+    'syr': None, # Syriac, Modern
+    'gem': None, # Old Saxon
+    'non': None, # Old Norse
+    'emg': None, # Eastern Meohang
+    'neg': None, # Negidal
+    'mul': None, # Multiple languages
+    'und': None, # Undetermined
+}