about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2021-11-30 14:20:11 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2021-11-30 14:20:52 -0800
commit: 9542ab3ea9145e937e412bb707d96ab031b13e31 (patch)
tree: 11fb6e8e1286050142c287a97a12ab833188bb47
parent: a13787eaa6738e5f2ffb29d1d4d9a83617a1b943 (diff)
download: chocula-9542ab3ea9145e937e412bb707d96ab031b13e31.tar.gz
chocula-9542ab3ea9145e937e412bb707d96ab031b13e31.zip
improve homepage URL filtering
-rw-r--r-- chocula/database.py 42
1 file changed, 28 insertions, 14 deletions
diff --git a/chocula/database.py b/chocula/database.py
index 1982108..9d7bfb1 100644
--- a/chocula/database.py
+++ b/chocula/database.py
@@ -44,12 +44,19 @@ class HomepageUrl:
"""
Returns None if url is really bad (not a URL).
"""
+ if not url:
+ return None
+ if url.startswith("www."):
+ url = "http://" + url
+ if url.startswith("ttp://") or url.startswith("ttps://"):
+ url = "h" + url
+ url.replace("Http://", "http://")
+ url = str(urlcanon.semantic_precise(url))
if (
not url
- or '://' not in url
- or not url.lower().startswith('http')
+ or "://" not in url
+ or not url.lower().startswith("http")
or "mailto:" in url.lower()
- or url.lower() in ("http://n/a", "http://na/", "http://na")
or "LOCKSS_RESOLVER" in url
or "$result.AccessURL" in url
or "://firstsearch.oclc.org" in url
@@ -62,21 +69,17 @@ class HomepageUrl:
or "://doaj.org" in url
):
return None
- if url.startswith("www."):
- url = "http://" + url
- if url.startswith("ttp://") or url.startswith("ttps://"):
- url = "h" + url
- url.replace("Http://", "http://")
- url = str(urlcanon.semantic_precise(url))
- if url == "http://na/":
- # sort of redundant with above, but some only match after canonicalization
- return None
- url_surt = surt.surt(url)
tld = tldextract.extract(url)
host = ".".join(tld)
if host.startswith("."):
host = host[1:]
+ if not (tld.registered_domain and tld.suffix):
+ return None
+ try:
+ url_surt = surt.surt(url)
+ except ValueError:
+ return None
return HomepageUrl(
url=url,
surt=url_surt,
@@ -95,10 +98,21 @@ def test_from_url():
assert HomepageUrl.from_url("google.com").suffix == "com"
assert HomepageUrl.from_url("google.com").host == "google.com"
- assert HomepageUrl.from_url("mailto:bnewbold@bogus.com") == None
+ assert HomepageUrl.from_url("mailto:bnewbold@bogus.com") is None
assert HomepageUrl.from_url("thing.com").url == "http://thing.com/"
assert HomepageUrl.from_url("Http://thing.com///").url == "http://thing.com/"
+ assert HomepageUrl.from_url("http://na") is None
+ assert HomepageUrl.from_url("http://n/a") is None
+ assert HomepageUrl.from_url("http:///???") is None
+ assert (
+ HomepageUrl.from_url("https://jurnal.stiemuarateweh.ac.id:443ojs/index.php/JSM")
+ is None
+ )
+ assert HomepageUrl.from_url("") is None
+ assert HomepageUrl.from_url("https://") is None
+ assert HomepageUrl.from_url("https://:80/thing.pdf") is None
+
@dataclass
class UrlCrawlStatus: