about summary refs log tree commit diff stats
path: root/python/sandcrawler/misc.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/sandcrawler/misc.py')
-rw-r--r-- python/sandcrawler/misc.py 7
1 files changed, 7 insertions, 0 deletions
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index 88669e6..d9c9d55 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -6,8 +6,15 @@ import datetime
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error
+import urlcanon
+def clean_url(s):
+ """
+ Parses `s` with urlcanon, blanks the colon-before-port separator when no
+ port follows it, and returns the urlcanon `whatwg`-canonicalized URL as a
+ str.
+ """
+ parsed = urlcanon.parse_url(s)
+ # A separator colon with an empty port (presumably input such as
+ # "http://host:/path" -- TODO confirm against urlcanon's parser) is cleared
+ # so it is not carried into the canonicalized output.
+ if not parsed.port and parsed.colon_before_port:
+ parsed.colon_before_port = b''
+ # NOTE(review): assumes urlcanon.whatwg returns the parsed URL object and
+ # that str() of it yields the serialized URL -- confirm in urlcanon docs.
+ return str(urlcanon.whatwg(parsed))
+
def gen_file_metadata(blob):
"""
Takes a file blob (bytestream) and returns hashes and other metadata.