aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2022-07-14 15:03:49 -0700
committer Bryan Newbold <bnewbold@archive.org> 2022-07-14 15:03:51 -0700
commit b5217753166956eed14cf2c91ec52d883d6a5a56 (patch)
tree 758026fb0061d66e49fede1b3ef451d56ab8ac93
parent b680c255508e6721185c6793bc872c0dc97864a0 (diff)
download sandcrawler-b5217753166956eed14cf2c91ec52d883d6a5a56.tar.gz
sandcrawler-b5217753166956eed14cf2c91ec52d883d6a5a56.zip
cdx lookups: prioritize truly exact URL matches
This hopefully resolves an issue causing many apparent redirect loops, which were actually timing or HTTP status code near-loops caused by http/https fuzzy matching in the CDX API, despite the API's "exact" lookup semantics.
-rw-r--r-- python/sandcrawler/ia.py 1
1 files changed, 1 insertions, 0 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 7b9427e..bb67c87 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -345,6 +345,7 @@ class CdxApiClient:
*reverse* order.
"""
return (
+ int(r.url == url),
int(r.status_code in (200, 226)),
int(0 - (r.status_code or 999)),
int(r.mimetype == best_mimetype),