From 8837977d2892beac6cf412f58dafcdbf06f323ac Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Tue, 10 Mar 2020 22:40:00 -0700
Subject: url cleaning (canonicalization) for ingest base_url

As mentioned in the comment, this first version does not re-write the URL in
the `base_url` field. If we did so, then ingest_request rows would not
SQL JOIN to ingest_file_result rows, which we wouldn't want.

In the future, the behaviour should maybe be to refuse to process URLs that
aren't clean (eg, if base_url != clean_url(base_url)) and return a
'bad-url' status or something. Then we would only accept clean URLs in
both tables, and clear out all old/bad URLs with a cleanup script.
---
 python/sandcrawler/misc.py | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'python/sandcrawler/misc.py')

diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index 88669e6..d9c9d55 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -6,8 +6,15 @@ import datetime
 import requests
 from requests.adapters import HTTPAdapter
 from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error
+import urlcanon
 
 
+def clean_url(s):
+    parsed = urlcanon.parse_url(s)
+    if not parsed.port and parsed.colon_before_port:
+        parsed.colon_before_port = b''
+    return str(urlcanon.whatwg(parsed))
+
 def gen_file_metadata(blob):
     """
     Takes a file blob (bytestream) and returns hashes and other metadata.
--
cgit v1.2.3
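
For reference, a minimal usage sketch of the clean_url() helper added in this
commit. It reuses the function exactly as it appears in the diff; the example
URLs and the normalizations noted in the comments are assumptions about how
WHATWG canonicalization behaves, not outputs recorded in the commit.

    # sketch: exercising clean_url() from python/sandcrawler/misc.py
    # requires the `urlcanon` package
    import urlcanon

    def clean_url(s):
        # parse the raw URL, drop a dangling ':' left over when no port was
        # actually given, then apply WHATWG canonicalization rules
        parsed = urlcanon.parse_url(s)
        if not parsed.port and parsed.colon_before_port:
            parsed.colon_before_port = b''
        return str(urlcanon.whatwg(parsed))

    if __name__ == "__main__":
        # assumed examples: an empty port and mixed-case scheme/host are both
        # expected to be normalized away by the WHATWG rules
        for raw in [
            "http://example.com:/some/path",
            "HTTP://Example.COM/some/path",
        ]:
            print(raw, "->", clean_url(raw))

The future 'bad-url' check described in the commit message could then be a
simple comparison, rejecting any request where base_url != clean_url(base_url).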