diff options
Diffstat (limited to 'python')
| -rw-r--r-- | python/sandcrawler/ia.py | 5 | ||||
| -rw-r--r-- | python/sandcrawler/ingest.py | 1 | ||||
| -rw-r--r-- | python/sandcrawler/misc.py | 1 | 
3 files changed, 6 insertions, 1 deletion
| diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index 25697be..cc176d0 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -18,7 +18,7 @@ from http.client import IncompleteRead  from wayback.resourcestore import ResourceStore  from gwb.loader import CDXLoaderFactory -from .misc import b32_hex, requests_retry_session, gen_file_metadata +from .misc import b32_hex, requests_retry_session, gen_file_metadata, clean_url  class SandcrawlerBackoffError(Exception):      """ @@ -543,6 +543,7 @@ class WaybackClient:          if redirect_url and redirect_url.startswith("https://web.archive.org/web/"):              redirect_url = "/".join(redirect_url.split("/")[5:])          #print(redirect_url, file=sys.stderr) +        redirect_url = clean_url(redirect_url)          if redirect_url and redirect_url.startswith("http"):              return redirect_url          else: @@ -666,11 +667,13 @@ class WaybackClient:                          next_url = domain_prefix + resource.location                      else:                          next_url = resource.location +                    next_url = clean_url(next_url)                  else:                      next_url = self.fetch_replay_redirect(                          url=cdx_row.url,                          datetime=cdx_row.datetime,                      ) +                    next_url = clean_url(next_url)                      cdx_row = cdx_partial_from_row(cdx_row)                      if not next_url:                          print("bad redirect record: {}".format(cdx_row), file=sys.stderr) diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index c9a697c..4159e26 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -354,6 +354,7 @@ class IngestFileWorker(SandcrawlerWorker):                      return result                  next_url = fulltext_url.get('pdf_url') or fulltext_url.get('next_url')                  assert next_url +            
    next_url = clean_url(next_url)                  print("[PARSE\t] {}\t{}".format(                          fulltext_url.get('technique'),                          next_url, diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py index d9c9d55..1b8aa92 100644 --- a/python/sandcrawler/misc.py +++ b/python/sandcrawler/misc.py @@ -10,6 +10,7 @@ import urlcanon  def clean_url(s): +    s = s.strip()      parsed = urlcanon.parse_url(s)      if not parsed.port and parsed.colon_before_port:          parsed.colon_before_port = b'' | 
