aboutsummaryrefslogtreecommitdiffstats
path: root/chocula/util.py
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-05-06 18:26:53 -0700
committerBryan Newbold <bnewbold@archive.org>2020-05-07 00:59:37 -0700
commit4d701f4f2ea99ac95bd4235adef1998f3abdc9f9 (patch)
tree6408d86364109765d0deb3692321ed7f3128ea05 /chocula/util.py
parentd559304babb24e4961ba13c554817730b46cfadc (diff)
downloadchocula-4d701f4f2ea99ac95bd4235adef1998f3abdc9f9.tar.gz
chocula-4d701f4f2ea99ac95bd4235adef1998f3abdc9f9.zip
start a Makefile
Move all "index" functions into classes, each in a separate file. Add lots of type annotations. Use dataclass objects to hold database rows. This aspect will need further refactoring to remove "extra" usage, probably by adding database rows to align with DatabaseInfo more closely.
Diffstat (limited to 'chocula/util.py')
-rw-r--r--chocula/util.py95
1 files changed, 43 insertions, 52 deletions
diff --git a/chocula/util.py b/chocula/util.py
index 533b41a..a8a30db 100644
--- a/chocula/util.py
+++ b/chocula/util.py
@@ -1,7 +1,8 @@
-import urlcanon
-import surt
-import tldextract
+from dataclasses import dataclass
+from typing import Dict, Optional
+
+import ftfy
import pycountry
################### Utilities
@@ -118,15 +119,6 @@ OTHER_PUBLISHERS = [
'Project MUSE',
]
-def unquote(s):
- if s.startswith('"'):
- s = s[1:]
- if s.endswith('"'):
- s = s[:-1]
- if s.endswith('.'):
- s = s[:-1]
- return s.strip()
-
def parse_lang(s):
if not s or s in ('Not applicable', 'Multiple languages', 'Unknown'):
return None
@@ -260,52 +252,51 @@ def test_merge_spans():
[[1450, 1900], [2000, 2000]]
-def parse_url(url):
+def unquote(s: str) -> str:
+ if s.startswith('"') or s.startswith("'"):
+ s = s[1:]
+ if s.endswith('"') or s.endswith("'"):
+ s = s[:-1]
+ if s.endswith('.'):
+ s = s[:-1]
+ return s.strip()
+
+
+def clean_str(s: Optional[str]) -> Optional[str]:
"""
- Parses/cleans URLs.
+ Takes a generic string and "cleans" it:
- Returns a dict with:
-
- url: str, cleaned/normalized URL
- url_surt: str, "sortable url" (a web-archiving format)
- host: str, full hostname
- registered_domain: "primary domain", eg "google.com" or "thing.co.uk"
- suffix: str, eg "com" or "co.uk"
+ - strips whitespace
+ - de-mangles unicode
+ - strips HTML tags
+ - transforms HTML entities to unicode characters
+ - removes leading and trailing quotation marks and trailing periods
- Returns None if url is really bad (not a URL).
+ This version of the function is pretty aggressive; it is intended for
+ journal titles, publisher names, etc, not things like article titles.
"""
- if not url or 'mailto:' in url.lower() or url.lower() in ('http://n/a', 'http://na/', 'http://na'):
+ if not s:
return None
- if url.startswith('www.'):
- url = "http://" + url
- if url.startswith('ttp://') or url.startswith('ttps://'):
- url = "h" + url
- url.replace('Http://', 'http://')
+ s = unquote(ftfy.fix_text(s))
+ return s or None
- url = str(urlcanon.semantic_precise(url))
- if url == 'http://na/':
- # sort of redundant with above, but some only match after canonicalization
- return None
- url_surt = surt.surt(url)
- tld = tldextract.extract(url)
- host = '.'.join(tld)
- if host.startswith('.'):
- host = host[1:]
- return dict(url=url,
- url_surt=url_surt or None,
- host=host or None,
- registered_domain=tld.registered_domain or None,
- suffix=tld.suffix or None)
+def test_clean_str():
+ assert clean_str("") is None
+ assert clean_str(" ") is None
+ assert clean_str('""') is None
+ assert clean_str(" Bloody work.") == "Bloody work"
-def test_parse_url():
-
- assert parse_url("http://thing.core.ac.uk")['registered_domain'] == 'core.ac.uk'
- assert parse_url("http://thing.core.ac.uk")['host'] == 'thing.core.ac.uk'
- assert parse_url("http://thing.core.ac.uk")['suffix'] == 'ac.uk'
- assert parse_url("google.com")['suffix'] == 'com'
- assert parse_url("google.com")['host'] == 'google.com'
- assert parse_url("mailto:bnewbold@bogus.com") == None
- assert parse_url("thing.com")['url'] == 'http://thing.com/'
- assert parse_url("Http://thing.com///")['url'] == 'http://thing.com/'
+def clean_issn(s: str) -> Optional[str]:
+ s = s.strip().upper()
+ if len(s) == 8:
+ s = s[:4] + "-" + s[4:]
+ if len(s) != 9 or s[4] != "-":
+ return None
+ return s
+
+def test_clean_issn():
+ assert clean_issn("1234-5678") == "1234-5678"
+ assert clean_issn(" 12345678") == "1234-5678"
+ assert clean_issn("123445678") == None