From b5217753166956eed14cf2c91ec52d883d6a5a56 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 14 Jul 2022 15:03:49 -0700 Subject: cdx lookups: prioritize truly exact URL matches This hopefully resolves an issue causing many apparent redirect loops, which were actually timing or HTTP status code near-loops caused by http/https fuzzy matching in the CDX API, despite the API's "exact" lookup semantics. --- python/sandcrawler/ia.py | 1 + 1 file changed, 1 insertion(+) (limited to 'python/sandcrawler') diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index 7b9427e..bb67c87 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -345,6 +345,7 @@ class CdxApiClient: *reverse* order. """ return ( + int(r.url == url), int(r.status_code in (200, 226)), int(0 - (r.status_code or 999)), int(r.mimetype == best_mimetype), -- cgit v1.2.3