From 4f0d10f4b38534eda673a8dfe28e3a58af9a8a8a Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Mon, 24 Oct 2022 14:17:44 -0700
Subject: ingest: don't prefer WARC over SPN so strongly

We generally prefer an older WARC record over an SPN record, because the
lookup is easier. But, this was causing problems with repeated ingest, so
demote it.

We may want to make this more configurable in the future, so things like
HTML sub-resource lookups or bulk ingest won't prefer random new SPN
captures.
---
 python/sandcrawler/ia.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 657bee6..672a0b6 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -367,8 +367,9 @@ class CdxApiClient:
                 int(r.mimetype == best_mimetype),
                 int(r.mimetype != "warc/revisit"),
                 r.datetime[:4] == closest_dt[:4],
-                int("/" in r.warc_path),
                 int(r.datetime),
+                # NOTE: previously we demoted SPN records with this warc_path check ahead of datetime
+                int("/" in r.warc_path),
             )
 
         rows = sorted(rows, key=_cdx_sort_key)
-- 
cgit v1.2.3