aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/misc.py
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-03-23 10:32:47 -0700
committerBryan Newbold <bnewbold@archive.org>2020-03-23 10:32:50 -0700
commit84eeefbd3c55ea31bcf552f9c129c0e1576717ae (patch)
treea66a381fc9535ff3ed6b3760c573b4476e3ab043 /python/sandcrawler/misc.py
parente5ad7bddbcb55471b96ce30397ed85fe98e3b098 (diff)
downloadsandcrawler-84eeefbd3c55ea31bcf552f9c129c0e1576717ae.tar.gz
sandcrawler-84eeefbd3c55ea31bcf552f9c129c0e1576717ae.zip
ingest: clean_url() in more places
Some 'cdx-error' results were due to URLs with ':' after the hostname or trailing newline ("\n") characters in the URL. This attempts to work around this category of error.
Diffstat (limited to 'python/sandcrawler/misc.py')
-rw-r--r--python/sandcrawler/misc.py1
1 files changed, 1 insertions, 0 deletions
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index d9c9d55..1b8aa92 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -10,6 +10,7 @@ import urlcanon
def clean_url(s):
+ s = s.strip()
parsed = urlcanon.parse_url(s)
if not parsed.port and parsed.colon_before_port:
parsed.colon_before_port = b''