diff options
Diffstat (limited to 'chocula/util.py')
-rw-r--r-- | chocula/util.py | 95 |
1 files changed, 43 insertions, 52 deletions
diff --git a/chocula/util.py b/chocula/util.py index 533b41a..a8a30db 100644 --- a/chocula/util.py +++ b/chocula/util.py @@ -1,7 +1,8 @@ -import urlcanon -import surt -import tldextract +from dataclasses import dataclass +from typing import Dict, Optional + +import ftfy import pycountry ################### Utilities @@ -118,15 +119,6 @@ OTHER_PUBLISHERS = [ 'Project MUSE', ] -def unquote(s): - if s.startswith('"'): - s = s[1:] - if s.endswith('"'): - s = s[:-1] - if s.endswith('.'): - s = s[:-1] - return s.strip() - def parse_lang(s): if not s or s in ('Not applicable', 'Multiple languages', 'Unknown'): return None @@ -260,52 +252,51 @@ def test_merge_spans(): [[1450, 1900], [2000, 2000]] -def parse_url(url): +def unquote(s: str) -> str: + if s.startswith('"') or s.startswith("'"): + s = s[1:] + if s.endswith('"') or s.endswith("'"): + s = s[:-1] + if s.endswith('.'): + s = s[:-1] + return s.strip() + + +def clean_str(s: Optional[str]) -> Optional[str]: """ - Parses/cleans URLs. + Takes a generic string and "cleans" it: - Returns a dict with: - - url: str, cleaned/normalized URL - url_surt: str, "sortable url" (a web-archiving format) - host: str, full hostname - registered_domain: "primary domain", eg "google.com" or "thing.co.uk" - suffix: str, eg "com" or "co.uk" + - strips whitespace + - de-mangles unicode + - strips HTML tags + - transforms HTML entities to unicode characters + - removes leading and trailing - Returns None if url is really bad (not a URL). + This version of the function is pretty aggressive; it is intended for + journal titles, publisher names, etc, not things like article titles. """ - if not url or 'mailto:' in url.lower() or url.lower() in ('http://n/a', 'http://na/', 'http://na'): + if not s: return None - if url.startswith('www.'): - url = "http://" + url - if url.startswith('ttp://') or url.startswith('ttps://'): - url = "h" + url - url.replace('Http://', 'http://') + s = unquote(ftfy.fix_text(s)) + return s or None - url = str(urlcanon.semantic_precise(url)) - if url == 'http://na/': - # sort of redundant with above, but some only match after canonicalization - return None - url_surt = surt.surt(url) - tld = tldextract.extract(url) - host = '.'.join(tld) - if host.startswith('.'): - host = host[1:] - return dict(url=url, - url_surt=url_surt or None, - host=host or None, - registered_domain=tld.registered_domain or None, - suffix=tld.suffix or None) +def test_clean_str(): + assert clean_str("") is None + assert clean_str(" ") is None + assert clean_str("" "") is None + assert clean_str(" Bloody work.") == "Bloody work" -def test_parse_url(): - - assert parse_url("http://thing.core.ac.uk")['registered_domain'] == 'core.ac.uk' - assert parse_url("http://thing.core.ac.uk")['host'] == 'thing.core.ac.uk' - assert parse_url("http://thing.core.ac.uk")['suffix'] == 'ac.uk' - assert parse_url("google.com")['suffix'] == 'com' - assert parse_url("google.com")['host'] == 'google.com' - assert parse_url("mailto:bnewbold@bogus.com") == None - assert parse_url("thing.com")['url'] == 'http://thing.com/' - assert parse_url("Http://thing.com///")['url'] == 'http://thing.com/' +def clean_issn(s: str) -> Optional[str]: + s = s.strip().upper() + if len(s) == 8: + s = s[:4] + "-" + s[4:] + if len(s) != 9 or s[4] != "-": + return None + return s + +def test_clean_issn(): + assert clean_issn("1234-5678") == "1234-5678" + assert clean_issn(" 12345678") == "1234-5678" + assert clean_issn("123445678") == None |