about | summary | refs | log | tree | commit | diff | stats
path: root/python/sandcrawler
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2022-07-16 13:08:41 -0700
committer Bryan Newbold <bnewbold@archive.org> 2022-07-16 13:08:41 -0700
commit 50893d02d205622668098890b0795f9168198caa (patch)
tree 341fec400d86874168ae1a18e1b6b6237bcc961a /python/sandcrawler
parent da12c99e0d9cdbdc8868a94f8d78b6cd3b2653fa (diff)
download sandcrawler-50893d02d205622668098890b0795f9168198caa.tar.gz
sandcrawler-50893d02d205622668098890b0795f9168198caa.zip
cdx: tweak CDX lookups and resolution (sort)
Diffstat (limited to 'python/sandcrawler')
-rw-r--r-- python/sandcrawler/ia.py 11
1 files changed, 7 insertions, 4 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index e08031e..65f30e7 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -325,7 +325,7 @@ class CdxApiClient:
params: Dict[str, str] = {
"url": url,
"matchType": "exact",
- "limit": "-25",
+ "limit": "-40",
"output": "json",
# Collapsing seems efficient, but is complex; would need to include
# other filters and status code in filter
@@ -336,11 +336,14 @@ class CdxApiClient:
if max_age_days:
since = datetime.date.today() - datetime.timedelta(days=max_age_days)
params["from"] = "%04d%02d%02d" % (since.year, since.month, since.day)
+ closest_dt = "00000000"
if closest:
if isinstance(closest, datetime.datetime):
- params["closest"] = "%04d%02d%02d" % (closest.year, closest.month, closest.day)
+ closest_dt = "%04d%02d%02d" % (closest.year, closest.month, closest.day)
+ params["closest"] = closest_dt
else:
- params["closest"] = closest
+ closest_dt = closest
+ params["closest"] = closest_dt
params["sort"] = "closest"
# print(params, file=sys.stderr)
rows = self._query_api(params)
@@ -359,7 +362,7 @@ class CdxApiClient:
int(0 - (r.status_code or 999)),
int(r.mimetype == best_mimetype),
int(r.mimetype != "warc/revisit"),
- int(r.datetime[:6]),
+ r.datetime[:4] == closest_dt[:4],
int("/" in r.warc_path),
int(r.datetime),
)