From 84eeefbd3c55ea31bcf552f9c129c0e1576717ae Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 23 Mar 2020 10:32:47 -0700 Subject: ingest: clean_url() in more places Some 'cdx-error' results were due to URLs with ':' after the hostname or trailing newline ("\n") characters in the URL. This attempts to work around this category of error. --- python/sandcrawler/misc.py | 1 + 1 file changed, 1 insertion(+) (limited to 'python/sandcrawler/misc.py') diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py index d9c9d55..1b8aa92 100644 --- a/python/sandcrawler/misc.py +++ b/python/sandcrawler/misc.py @@ -10,6 +10,7 @@ import urlcanon def clean_url(s): + s = s.strip() parsed = urlcanon.parse_url(s) if not parsed.port and parsed.colon_before_port: parsed.colon_before_port = b'' -- cgit v1.2.3