From 1b5ee74818da93fd80201a60a18632ff28692d91 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 30 Sep 2021 18:47:17 -0700 Subject: ingest CDX lookup: weigh year+month of capture against in-petabox-or-not This is to try to work around an issue where ingests fail because an SPN capture is much newer, but the old sorting preference ignored that. Note that the sorting logic is pretty busted anyway, and we should probably allow returning multiple matching files to try. --- python/sandcrawler/ia.py | 1 + 1 file changed, 1 insertion(+) (limited to 'python') diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index a5d19cd..c586972 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -297,6 +297,7 @@ class CdxApiClient: int(0 - (r.status_code or 999)), int(r.mimetype == best_mimetype), int(r.mimetype != "warc/revisit"), + int(r.datetime[:6]), int('/' in r.warc_path), int(r.datetime), ) -- cgit v1.2.3