From 50893d02d205622668098890b0795f9168198caa Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Sat, 16 Jul 2022 13:08:41 -0700
Subject: cdx: tweak CDX lookups and resolution (sort)

---
 python/sandcrawler/ia.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'python/sandcrawler')

diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index e08031e..65f30e7 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -325,7 +325,7 @@ class CdxApiClient:
         params: Dict[str, str] = {
             "url": url,
             "matchType": "exact",
-            "limit": "-25",
+            "limit": "-40",
             "output": "json",
             # Collapsing seems efficient, but is complex; would need to include
             # other filters and status code in filter
@@ -336,11 +336,14 @@ class CdxApiClient:
         if max_age_days:
             since = datetime.date.today() - datetime.timedelta(days=max_age_days)
             params["from"] = "%04d%02d%02d" % (since.year, since.month, since.day)
+        closest_dt = "00000000"
         if closest:
             if isinstance(closest, datetime.datetime):
-                params["closest"] = "%04d%02d%02d" % (closest.year, closest.month, closest.day)
+                closest_dt = "%04d%02d%02d" % (closest.year, closest.month, closest.day)
+                params["closest"] = closest_dt
             else:
-                params["closest"] = closest
+                closest_dt = closest
+                params["closest"] = closest_dt
             params["sort"] = "closest"
         # print(params, file=sys.stderr)
         rows = self._query_api(params)
@@ -359,7 +362,7 @@ class CdxApiClient:
                 int(0 - (r.status_code or 999)),
                 int(r.mimetype == best_mimetype),
                 int(r.mimetype != "warc/revisit"),
-                int(r.datetime[:6]),
+                r.datetime[:4] == closest_dt[:4],
                 int("/" in r.warc_path),
                 int(r.datetime),
             )
-- 
cgit v1.2.3