From 84eeefbd3c55ea31bcf552f9c129c0e1576717ae Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 23 Mar 2020 10:32:47 -0700 Subject: ingest: clean_url() in more places Some 'cdx-error' results were due to URLs with ':' after the hostname or trailing newline ("\n") characters in the URL. This attempts to work around this category of error. --- python/sandcrawler/ia.py | 5 ++++- python/sandcrawler/ingest.py | 1 + python/sandcrawler/misc.py | 1 + 3 files changed, 6 insertions(+), 1 deletion(-) (limited to 'python/sandcrawler') diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index 25697be..cc176d0 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -18,7 +18,7 @@ from http.client import IncompleteRead from wayback.resourcestore import ResourceStore from gwb.loader import CDXLoaderFactory -from .misc import b32_hex, requests_retry_session, gen_file_metadata +from .misc import b32_hex, requests_retry_session, gen_file_metadata, clean_url class SandcrawlerBackoffError(Exception): """ @@ -543,6 +543,7 @@ class WaybackClient: if redirect_url and redirect_url.startswith("https://web.archive.org/web/"): redirect_url = "/".join(redirect_url.split("/")[5:]) #print(redirect_url, file=sys.stderr) + redirect_url = clean_url(redirect_url) if redirect_url and redirect_url.startswith("http"): return redirect_url else: @@ -666,11 +667,13 @@ class WaybackClient: next_url = domain_prefix + resource.location else: next_url = resource.location + next_url = clean_url(next_url) else: next_url = self.fetch_replay_redirect( url=cdx_row.url, datetime=cdx_row.datetime, ) + next_url = clean_url(next_url) cdx_row = cdx_partial_from_row(cdx_row) if not next_url: print("bad redirect record: {}".format(cdx_row), file=sys.stderr) diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index c9a697c..4159e26 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -354,6 +354,7 @@ class 
IngestFileWorker(SandcrawlerWorker): return result next_url = fulltext_url.get('pdf_url') or fulltext_url.get('next_url') assert next_url + next_url = clean_url(next_url) print("[PARSE\t] {}\t{}".format( fulltext_url.get('technique'), next_url, diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py index d9c9d55..1b8aa92 100644 --- a/python/sandcrawler/misc.py +++ b/python/sandcrawler/misc.py @@ -10,6 +10,7 @@ import urlcanon def clean_url(s): + s = s.strip() parsed = urlcanon.parse_url(s) if not parsed.port and parsed.colon_before_port: parsed.colon_before_port = b'' -- cgit v1.2.3