about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2022-07-15 14:20:48 -0700
committer Bryan Newbold <bnewbold@archive.org> 2022-07-15 14:20:48 -0700
commit 0c0f9714724e65c0b12ac9c76132c6ab1590e823 (patch)
tree 90715e6de04bdf006a5b9fcf1717a4bed7b0d16a
parent 1690a1ef4704da2c71fbda9b83cd7b1f7e7199d8 (diff)
download sandcrawler-0c0f9714724e65c0b12ac9c76132c6ab1590e823.tar.gz
sandcrawler-0c0f9714724e65c0b12ac9c76132c6ab1590e823.zip
cdx api: add another allowable URL fuzzy-match pattern (double slashes)
-rw-r--r-- python/sandcrawler/ia.py 9
1 files changed, 9 insertions, 0 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 51326fa..227f7d0 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -136,6 +136,8 @@ def fuzzy_match_url(left: str, right: str) -> bool:
return True
if left == right + "/" or right == left + "/":
return True
+ if left.replace("//", "/") == right.replace("//", "/"):
+ return True
return False
@@ -147,6 +149,13 @@ def test_fuzzy_match_url() -> None:
assert fuzzy_match_url("https://thing.com", "http://thing.com/") is True
assert fuzzy_match_url("https://thing.com/", "http://thing.com") is True
assert fuzzy_match_url("http://thing.com", "http://thing.com/blue") is False
+ assert (
+ fuzzy_match_url(
+ "https://www.cairn.info/static/images//logo-partners/logo-cnl-negatif.png",
+ "https://www.cairn.info/static/images/logo-partners/logo-cnl-negatif.png",
+ )
+ is True
+ )
# should probably handle these?
assert fuzzy_match_url("http://thing.com", "http://www.thing.com") is False