From 4d701f4f2ea99ac95bd4235adef1998f3abdc9f9 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Wed, 6 May 2020 18:26:53 -0700
Subject: start a Makefile

Move all "index" functions into classes, each in a separate file.

Add lots of type annotations.

Use dataclass objects to hold database rows. This aspect will need
further refactoring to remove "extra" usage, probably by adding database
rows to align with DatabaseInfo more closely.
---
 chocula/util.py | 95 ++++++++++++++++++++++++++-------------------------------
 1 file changed, 43 insertions(+), 52 deletions(-)

(limited to 'chocula/util.py')

diff --git a/chocula/util.py b/chocula/util.py
index 533b41a..a8a30db 100644
--- a/chocula/util.py
+++ b/chocula/util.py
@@ -1,7 +1,8 @@
 
-import urlcanon
-import surt
-import tldextract
+from dataclasses import dataclass
+from typing import Dict, Optional
+
+import ftfy
 import pycountry
 
 ################### Utilities
@@ -118,15 +119,6 @@ OTHER_PUBLISHERS = [
     'Project MUSE',
 ]
 
-def unquote(s):
-    if s.startswith('"'):
-        s = s[1:]
-    if s.endswith('"'):
-        s = s[:-1]
-    if s.endswith('.'):
-        s = s[:-1]
-    return s.strip()
-
 def parse_lang(s):
     if not s or s in ('Not applicable', 'Multiple languages', 'Unknown'):
         return None
@@ -260,52 +252,51 @@ def test_merge_spans():
         [[1450, 1900], [2000, 2000]]
 
 
-def parse_url(url):
+def unquote(s: str) -> str:
+    if s.startswith('"') or s.startswith("'"):
+        s = s[1:]
+    if s.endswith('"') or s.endswith("'"):
+        s = s[:-1]
+    if s.endswith('.'):
+        s = s[:-1]
+    return s.strip()
+
+
+def clean_str(s: Optional[str]) -> Optional[str]:
     """
-    Parses/cleans URLs.
+    Takes a generic string and "cleans" it:
 
-    Returns a dict with:
-        
-        url: str, cleaned/normalized URL
-        url_surt: str, "sortable url" (a web-archiving format)
-        host: str, full hostname
-        registered_domain: "primary domain", eg "google.com" or "thing.co.uk"
-        suffix: str, eg "com" or "co.uk"
+    - strips whitespace
+    - de-mangles unicode
+    - strips HTML tags
+    - transforms HTML entities to unicode characters
+    - removes leading and trailing quote characters, and a trailing period
 
-    Returns None if url is really bad (not a URL).
+    This version of the function is pretty aggressive; it is intended for
+    journal titles, publisher names, etc, not things like article titles.
     """
-    if not url or 'mailto:' in url.lower() or url.lower() in ('http://n/a', 'http://na/', 'http://na'):
+    if not s:
         return None
-    if url.startswith('www.'):
-        url = "http://" + url
-    if url.startswith('ttp://') or url.startswith('ttps://'):
-        url = "h" + url
-    url.replace('Http://', 'http://')
+    s = unquote(ftfy.fix_text(s))
+    return s or None
 
-    url = str(urlcanon.semantic_precise(url))
-    if url == 'http://na/':
-        # sort of redundant with above, but some only match after canonicalization
-        return None
-    url_surt = surt.surt(url)
-    tld = tldextract.extract(url)
-    host = '.'.join(tld)
-    if host.startswith('.'):
-        host = host[1:]
-    return dict(url=url,
-                url_surt=url_surt or None,
-                host=host or None,
-                registered_domain=tld.registered_domain or None,
-                suffix=tld.suffix or None)
+def test_clean_str():
+    assert clean_str("") is None
+    assert clean_str(" ") is None
+    assert clean_str("" "") is None
+    assert clean_str(" Bloody work.") == "Bloody work"
 
-def test_parse_url():
-    
-    assert parse_url("http://thing.core.ac.uk")['registered_domain'] == 'core.ac.uk'
-    assert parse_url("http://thing.core.ac.uk")['host'] == 'thing.core.ac.uk'
-    assert parse_url("http://thing.core.ac.uk")['suffix'] == 'ac.uk'
 
-    assert parse_url("google.com")['suffix'] == 'com'
-    assert parse_url("google.com")['host'] == 'google.com'
 
-    assert parse_url("mailto:bnewbold@bogus.com") == None
-    assert parse_url("thing.com")['url'] == 'http://thing.com/'
-    assert parse_url("Http://thing.com///")['url'] == 'http://thing.com/'
+def clean_issn(s: str) -> Optional[str]:
+    s = s.strip().upper()
+    if len(s) == 8:
+        s = s[:4] + "-" + s[4:]
+    if len(s) != 9 or s[4] != "-":
+        return None
+    return s
+
+def test_clean_issn():
+    assert clean_issn("1234-5678") == "1234-5678"
+    assert clean_issn(" 12345678") == "1234-5678"
+    assert clean_issn("123445678") == None
-- 
cgit v1.2.3