aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2022-10-24 14:17:44 -0700
committerBryan Newbold <bnewbold@archive.org>2022-10-24 14:17:46 -0700
commit4f0d10f4b38534eda673a8dfe28e3a58af9a8a8a (patch)
tree87c84d496a9976084fc4af7825e549c07fbcffb9
parent855153ae4fe03656adde16c56a4347f4b3d26487 (diff)
downloadsandcrawler-4f0d10f4b38534eda673a8dfe28e3a58af9a8a8a.tar.gz
sandcrawler-4f0d10f4b38534eda673a8dfe28e3a58af9a8a8a.zip
ingest: don't prefer WARC over SPN so strongly
We generally prefer an older WARC record over an SPN record, because the lookup is easier. However, this was causing problems with repeated ingest, so demote that preference: the WARC-path check now sorts after the capture datetime instead of ahead of it. We may want to make this more configurable in the future, so that things like HTML sub-resource lookups or bulk ingest won't prefer random new SPN captures.
-rw-r--r--python/sandcrawler/ia.py3
1 files changed, 2 insertions, 1 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 657bee6..672a0b6 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -367,8 +367,9 @@ class CdxApiClient:
int(r.mimetype == best_mimetype),
int(r.mimetype != "warc/revisit"),
r.datetime[:4] == closest_dt[:4],
- int("/" in r.warc_path),
int(r.datetime),
+ # NOTE: previously we demoted SPN records with this warc_path check ahead of datetime
+ int("/" in r.warc_path),
)
rows = sorted(rows, key=_cdx_sort_key)