diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-10-19 15:46:37 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-10-19 15:46:39 -0700 |
commit | b672a6fe5b0e51f9d2844443bf9f7e82e1fd41b1 (patch) | |
tree | 82e03127ff94c9fb1c0d1807f9f76f367a0f37de | |
parent | cc26ea975e29eefa2e2d3565c55ba0ac0a491bb7 (diff) | |
download | sandcrawler-b672a6fe5b0e51f9d2844443bf9f7e82e1fd41b1.tar.gz sandcrawler-b672a6fe5b0e51f9d2844443bf9f7e82e1fd41b1.zip |
CDX fetch: more permissive fuzzy/normalization check
This might be the source of some `spn2-cdx-lookup-failure`.
Wayback/CDX does this check via full-on SURT, with many more changes,
and potentially we should be doing that here as well.
-rw-r--r-- | python/sandcrawler/ia.py | 12 |
1 files changed, 9 insertions, 3 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index ea29e67..426307a 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -106,18 +106,24 @@ def fuzzy_match_url(left, right): if left == right: return True if '://' in left and '://' in right: - if left.split('://')[1:] == right.split('://')[1:]: - return True + left = '://'.join(left.split('://')[1:]) + right = '://'.join(right.split('://')[1:]) + if left == right: + return True + if left == right + "/" or right == left + "/": + return True return False def test_fuzzy_match_url(): assert fuzzy_match_url("http://thing.com", "http://thing.com") == True assert fuzzy_match_url("http://thing.com", "https://thing.com") == True assert fuzzy_match_url("http://thing.com", "ftp://thing.com") == True + assert fuzzy_match_url("http://thing.com", "http://thing.com/") == True + assert fuzzy_match_url("https://thing.com", "http://thing.com/") == True + assert fuzzy_match_url("https://thing.com/", "http://thing.com") == True assert fuzzy_match_url("http://thing.com", "http://thing.com/blue") == False # should probably handle these? - assert fuzzy_match_url("http://thing.com", "http://thing.com/") == False assert fuzzy_match_url("http://thing.com", "http://www.thing.com") == False assert fuzzy_match_url("http://www.thing.com", "http://www2.thing.com") == False assert fuzzy_match_url("http://www.thing.com", "https://www2.thing.com") == False |