ingest: clean_url() in more places

Some 'cdx-error' results were due to URLs with ':' after the hostname or trailing newline ("\n") characters in the URL. This attempts to work around this category of error.
author: Bryan Newbold <bnewbold@archive.org> 2020-03-23 10:32:47 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-03-23 10:32:50 -0700
commit: 84eeefbd3c55ea31bcf552f9c129c0e1576717ae (patch)
tree: a66a381fc9535ff3ed6b3760c573b4476e3ab043
parent: e5ad7bddbcb55471b96ce30397ed85fe98e3b098 (diff)
download: sandcrawler-84eeefbd3c55ea31bcf552f9c129c0e1576717ae.tar.gz
sandcrawler-84eeefbd3c55ea31bcf552f9c129c0e1576717ae.zip
3 files changed, 6 insertions, 1 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 25697be..cc176d0 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -18,7 +18,7 @@ from http.client import IncompleteRead
 from wayback.resourcestore import ResourceStore
 from gwb.loader import CDXLoaderFactory
 
-from .misc import b32_hex, requests_retry_session, gen_file_metadata
+from .misc import b32_hex, requests_retry_session, gen_file_metadata, clean_url
 
 class SandcrawlerBackoffError(Exception):
     """
@@ -543,6 +543,7 @@ class WaybackClient:
         if redirect_url and redirect_url.startswith("https://web.archive.org/web/"):
             redirect_url = "/".join(redirect_url.split("/")[5:])
         #print(redirect_url, file=sys.stderr)
+        redirect_url = clean_url(redirect_url)
         if redirect_url and redirect_url.startswith("http"):
             return redirect_url
         else:
@@ -666,11 +667,13 @@ class WaybackClient:
                         next_url = domain_prefix + resource.location
                     else:
                         next_url = resource.location
+                    next_url = clean_url(next_url)
                 else:
                     next_url = self.fetch_replay_redirect(
                         url=cdx_row.url,
                         datetime=cdx_row.datetime,
                     )
+                    next_url = clean_url(next_url)
                     cdx_row = cdx_partial_from_row(cdx_row)
                     if not next_url:
                         print("bad redirect record: {}".format(cdx_row), file=sys.stderr)
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index c9a697c..4159e26 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -354,6 +354,7 @@ class IngestFileWorker(SandcrawlerWorker):
                     return result
                 next_url = fulltext_url.get('pdf_url') or fulltext_url.get('next_url')
                 assert next_url
+                next_url = clean_url(next_url)
                 print("[PARSE\t] {}\t{}".format(
                         fulltext_url.get('technique'),
                         next_url,
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index d9c9d55..1b8aa92 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -10,6 +10,7 @@ import urlcanon
 
 
 def clean_url(s):
+    s = s.strip()
     parsed = urlcanon.parse_url(s)
     if not parsed.port and parsed.colon_before_port:
         parsed.colon_before_port = b''
author	Bryan Newbold <bnewbold@archive.org>	2020-03-23 10:32:47 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2020-03-23 10:32:50 -0700
commit	84eeefbd3c55ea31bcf552f9c129c0e1576717ae (patch)
tree	a66a381fc9535ff3ed6b3760c573b4476e3ab043
parent	e5ad7bddbcb55471b96ce30397ed85fe98e3b098 (diff)
download	sandcrawler-84eeefbd3c55ea31bcf552f9c129c0e1576717ae.tar.gz sandcrawler-84eeefbd3c55ea31bcf552f9c129c0e1576717ae.zip