aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/ia.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/sandcrawler/ia.py')
-rw-r--r-- python/sandcrawler/ia.py 12
1 files changed, 9 insertions, 3 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index ea29e67..426307a 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -106,18 +106,24 @@ def fuzzy_match_url(left, right):
if left == right:
return True
if '://' in left and '://' in right:
- if left.split('://')[1:] == right.split('://')[1:]:
- return True
+ left = '://'.join(left.split('://')[1:])
+ right = '://'.join(right.split('://')[1:])
+ if left == right:
+ return True
+ if left == right + "/" or right == left + "/":
+ return True
return False
def test_fuzzy_match_url():
assert fuzzy_match_url("http://thing.com", "http://thing.com") == True
assert fuzzy_match_url("http://thing.com", "https://thing.com") == True
assert fuzzy_match_url("http://thing.com", "ftp://thing.com") == True
+ assert fuzzy_match_url("http://thing.com", "http://thing.com/") == True
+ assert fuzzy_match_url("https://thing.com", "http://thing.com/") == True
+ assert fuzzy_match_url("https://thing.com/", "http://thing.com") == True
assert fuzzy_match_url("http://thing.com", "http://thing.com/blue") == False
# should probably handle these?
- assert fuzzy_match_url("http://thing.com", "http://thing.com/") == False
assert fuzzy_match_url("http://thing.com", "http://www.thing.com") == False
assert fuzzy_match_url("http://www.thing.com", "http://www2.thing.com") == False
assert fuzzy_match_url("http://www.thing.com", "https://www2.thing.com") == False