aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-10-19 15:46:37 -0700
committerBryan Newbold <bnewbold@archive.org>2020-10-19 15:46:39 -0700
commitb672a6fe5b0e51f9d2844443bf9f7e82e1fd41b1 (patch)
tree82e03127ff94c9fb1c0d1807f9f76f367a0f37de
parentcc26ea975e29eefa2e2d3565c55ba0ac0a491bb7 (diff)
downloadsandcrawler-b672a6fe5b0e51f9d2844443bf9f7e82e1fd41b1.tar.gz
sandcrawler-b672a6fe5b0e51f9d2844443bf9f7e82e1fd41b1.zip
CDX fetch: more permissive fuzzy/normalization check
This might be the source of some `spn2-cdx-lookup-failure`. Wayback/CDX does this check via full-on SURT, with many more changes, and potentially we should be doing that here as well.
-rw-r--r--python/sandcrawler/ia.py12
1 files changed, 9 insertions, 3 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index ea29e67..426307a 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -106,18 +106,24 @@ def fuzzy_match_url(left, right):
if left == right:
return True
if '://' in left and '://' in right:
- if left.split('://')[1:] == right.split('://')[1:]:
- return True
+ left = '://'.join(left.split('://')[1:])
+ right = '://'.join(right.split('://')[1:])
+ if left == right:
+ return True
+ if left == right + "/" or right == left + "/":
+ return True
return False
def test_fuzzy_match_url():
assert fuzzy_match_url("http://thing.com", "http://thing.com") == True
assert fuzzy_match_url("http://thing.com", "https://thing.com") == True
assert fuzzy_match_url("http://thing.com", "ftp://thing.com") == True
+ assert fuzzy_match_url("http://thing.com", "http://thing.com/") == True
+ assert fuzzy_match_url("https://thing.com", "http://thing.com/") == True
+ assert fuzzy_match_url("https://thing.com/", "http://thing.com") == True
assert fuzzy_match_url("http://thing.com", "http://thing.com/blue") == False
# should probably handle these?
- assert fuzzy_match_url("http://thing.com", "http://thing.com/") == False
assert fuzzy_match_url("http://thing.com", "http://www.thing.com") == False
assert fuzzy_match_url("http://www.thing.com", "http://www2.thing.com") == False
assert fuzzy_match_url("http://www.thing.com", "https://www2.thing.com") == False