From 0c0f9714724e65c0b12ac9c76132c6ab1590e823 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 15 Jul 2022 14:20:48 -0700 Subject: cdx api: add another allowable URL fuzzy-match pattern (double slashes) --- python/sandcrawler/ia.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index 51326fa..227f7d0 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -136,6 +136,8 @@ def fuzzy_match_url(left: str, right: str) -> bool: return True if left == right + "/" or right == left + "/": return True + if left.replace("//", "/") == right.replace("//", "/"): + return True return False @@ -147,6 +149,13 @@ def test_fuzzy_match_url() -> None: assert fuzzy_match_url("https://thing.com", "http://thing.com/") is True assert fuzzy_match_url("https://thing.com/", "http://thing.com") is True assert fuzzy_match_url("http://thing.com", "http://thing.com/blue") is False + assert ( + fuzzy_match_url( + "https://www.cairn.info/static/images//logo-partners/logo-cnl-negatif.png", + "https://www.cairn.info/static/images/logo-partners/logo-cnl-negatif.png", + ) + is True + ) # should probably handle these? assert fuzzy_match_url("http://thing.com", "http://www.thing.com") is False -- cgit v1.2.3