 python/sandcrawler/ia.py     | 5 ++++-
 python/sandcrawler/ingest.py | 1 +
 python/sandcrawler/misc.py   | 1 +
 3 files changed, 6 insertions(+), 1 deletion(-)
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 25697be..cc176d0 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -18,7 +18,7 @@ from http.client import IncompleteRead
from wayback.resourcestore import ResourceStore
from gwb.loader import CDXLoaderFactory
-from .misc import b32_hex, requests_retry_session, gen_file_metadata
+from .misc import b32_hex, requests_retry_session, gen_file_metadata, clean_url
class SandcrawlerBackoffError(Exception):
"""
@@ -543,6 +543,7 @@ class WaybackClient:
if redirect_url and redirect_url.startswith("https://web.archive.org/web/"):
redirect_url = "/".join(redirect_url.split("/")[5:])
#print(redirect_url, file=sys.stderr)
+ redirect_url = clean_url(redirect_url)
if redirect_url and redirect_url.startswith("http"):
return redirect_url
else:
@@ -666,11 +667,13 @@ class WaybackClient:
next_url = domain_prefix + resource.location
else:
next_url = resource.location
+ next_url = clean_url(next_url)
else:
next_url = self.fetch_replay_redirect(
url=cdx_row.url,
datetime=cdx_row.datetime,
)
+ next_url = clean_url(next_url)
cdx_row = cdx_partial_from_row(cdx_row)
if not next_url:
print("bad redirect record: {}".format(cdx_row), file=sys.stderr)
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index c9a697c..4159e26 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -354,6 +354,7 @@ class IngestFileWorker(SandcrawlerWorker):
return result
next_url = fulltext_url.get('pdf_url') or fulltext_url.get('next_url')
assert next_url
+ next_url = clean_url(next_url)
print("[PARSE\t] {}\t{}".format(
fulltext_url.get('technique'),
next_url,
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index d9c9d55..1b8aa92 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -10,6 +10,7 @@ import urlcanon
def clean_url(s):
+ s = s.strip()
parsed = urlcanon.parse_url(s)
if not parsed.port and parsed.colon_before_port:
parsed.colon_before_port = b''
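Putting the misc.py hunk in context, here is a minimal sketch of the patched helper. Only the strip() call added by this commit, the urlcanon.parse_url() call, and the port-colon fix appear in the diff; the whatwg canonicalization and the return line are assumptions about how the function likely ends.

    import urlcanon

    def clean_url(s):
        # New in this commit: drop leading/trailing whitespace before parsing
        s = s.strip()
        parsed = urlcanon.parse_url(s)
        # Remove a dangling ":" after the host when no port number follows it
        if not parsed.port and parsed.colon_before_port:
            parsed.colon_before_port = b''
        # Assumed tail: canonicalize and serialize back to a string
        return str(urlcanon.whatwg(parsed))

    # Expected behavior (assumed): whitespace and the bare colon are both removed
    print(clean_url("http://example.com:/paper.pdf \n"))  # http://example.com/paper.pdf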