From b672a6fe5b0e51f9d2844443bf9f7e82e1fd41b1 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 19 Oct 2020 15:46:37 -0700 Subject: CDX fetch: more permissive fuzzy/normalization check This might be the source of some `spn2-cdx-lookup-failure`. Wayback/CDX does this check via full-on SURT, with many more changes, and potentially we should be doing that here as well. --- python/sandcrawler/ia.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'python/sandcrawler/ia.py') diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index ea29e67..426307a 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -106,18 +106,24 @@ def fuzzy_match_url(left, right): if left == right: return True if '://' in left and '://' in right: - if left.split('://')[1:] == right.split('://')[1:]: - return True + left = '://'.join(left.split('://')[1:]) + right = '://'.join(right.split('://')[1:]) + if left == right: + return True + if left == right + "/" or right == left + "/": + return True return False def test_fuzzy_match_url(): assert fuzzy_match_url("http://thing.com", "http://thing.com") == True assert fuzzy_match_url("http://thing.com", "https://thing.com") == True assert fuzzy_match_url("http://thing.com", "ftp://thing.com") == True + assert fuzzy_match_url("http://thing.com", "http://thing.com/") == True + assert fuzzy_match_url("https://thing.com", "http://thing.com/") == True + assert fuzzy_match_url("https://thing.com/", "http://thing.com") == True assert fuzzy_match_url("http://thing.com", "http://thing.com/blue") == False # should probably handle these? - assert fuzzy_match_url("http://thing.com", "http://thing.com/") == False assert fuzzy_match_url("http://thing.com", "http://www.thing.com") == False assert fuzzy_match_url("http://www.thing.com", "http://www2.thing.com") == False assert fuzzy_match_url("http://www.thing.com", "https://www2.thing.com") == False -- cgit v1.2.3